DriverTrac/venv/lib/python3.12/site-packages/polars/selectors.py

from __future__ import annotations

import builtins
import contextlib
import datetime as pydatetime
import sys
from collections.abc import Collection, Mapping, Sequence
from decimal import Decimal as PyDecimal
from functools import reduce
from operator import or_
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    NoReturn,
    overload,
)

import polars.datatypes.classes as pldt
from polars import functions as F
from polars._utils.parse.expr import _parse_inputs_as_iterable
from polars._utils.unstable import unstable
from polars._utils.various import is_column, re_escape
from polars.datatypes import (
    Binary,
    Boolean,
    Categorical,
    Date,
    String,
    Time,
    is_polars_dtype,
)
from polars.expr import Expr

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars._plr import PyExpr, PySelector

if sys.version_info >= (3, 10):
    from types import NoneType
else:  # pragma: no cover
    # Define equivalent for older Python versions
    NoneType = type(None)

if TYPE_CHECKING:
    from collections.abc import Iterable

    from polars import DataFrame, LazyFrame
    from polars._typing import PolarsDataType, PythonDataType, TimeUnit

__all__ = [
    # class
    "Selector",
    # functions
    "all",
    "alpha",
    "alphanumeric",
    "array",
    "binary",
    "boolean",
    "by_dtype",
    "by_index",
    "by_name",
    "categorical",
    "contains",
    "date",
    "datetime",
    "decimal",
    "digit",
    "duration",
    "ends_with",
    "enum",
    "exclude",
    "expand_selector",
    "first",
    "float",
    "integer",
    "is_selector",
    "last",
    "list",
    "matches",
    "nested",
    "numeric",
    "signed_integer",
    "starts_with",
    "string",
    "struct",
    "temporal",
    "time",
    "unsigned_integer",
]


@overload
def is_selector(obj: Selector) -> Literal[True]: ...


@overload
def is_selector(obj: Any) -> Literal[False]: ...


def is_selector(obj: Any) -> bool:
    """
    Indicate whether the given object/expression is a selector.

    Examples
    --------
    >>> from polars.selectors import is_selector
    >>> import polars.selectors as cs
    >>> is_selector(pl.col("colx"))
    False
    >>> is_selector(cs.first() | cs.last())
    True
    """
    return isinstance(obj, Selector)


# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
#  This should move to IR conversion / Rust.
def expand_selector(
    target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
    selector: Selector | Expr,
    *,
    strict: bool = True,
) -> tuple[str, ...]:
    """
    Expand selector to column names, with respect to a specific frame or target schema.

    .. versionadded:: 0.20.30
        The `strict` parameter was added.

    Parameters
    ----------
    target
        A Polars DataFrame, LazyFrame or Schema.
    selector
        An arbitrary polars selector (or compound selector).
    strict
        Setting False additionally allows for a broader range of column selection
        expressions (such as bare columns or use of `.exclude()`) to be expanded,
        not just the dedicated selectors.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "colx": ["a", "b", "c"],
    ...         "coly": [123, 456, 789],
    ...         "colz": [2.0, 5.5, 8.0],
    ...     }
    ... )

    Expand selector with respect to an existing `DataFrame`:

    >>> cs.expand_selector(df, cs.numeric())
    ('coly', 'colz')
    >>> cs.expand_selector(df, cs.first() | cs.last())
    ('colx', 'colz')

    This also works with `LazyFrame`:

    >>> cs.expand_selector(df.lazy(), ~(cs.first() | cs.last()))
    ('coly',)

    Expand selector with respect to a standalone `Schema` dict:

    >>> schema = {
    ...     "id": pl.Int64,
    ...     "desc": pl.String,
    ...     "count": pl.UInt32,
    ...     "value": pl.Float64,
    ... }
    >>> cs.expand_selector(schema, cs.string() | cs.float())
    ('desc', 'value')

    Allow for non-strict selection expressions (such as those
    including use of an `.exclude()` constraint) to be expanded:

    >>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False)
    ('count', 'value')
    """
    if isinstance(target, Mapping):
        from polars.dataframe import DataFrame

        target = DataFrame(schema=target)

    if not (
        is_selector(selector)
        if strict
        else selector.meta.is_column_selection(allow_aliasing=False)
    ):
        msg = f"expected a selector; found {selector!r} instead."
        raise TypeError(msg)

    return tuple(target.select(selector).collect_schema())


# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
#  This should move to IR conversion / Rust.
def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> builtins.list[Any]:
    """
    Internal function that expands any selectors to column names in the given input.

    Non-selector values are left as-is.

    Examples
    --------
    >>> from polars.selectors import _expand_selectors
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "colw": ["a", "b"],
    ...         "colx": ["x", "y"],
    ...         "coly": [123, 456],
    ...         "colz": [2.0, 5.5],
    ...     }
    ... )
    >>> _expand_selectors(df, ["colx", cs.numeric()])
    ['colx', 'coly', 'colz']
    >>> _expand_selectors(df, cs.string(), cs.float())
    ['colw', 'colx', 'colz']
    """
    items_iter = _parse_inputs_as_iterable(items)

    expanded: builtins.list[Any] = []
    for item in items_iter:
        if is_selector(item):
            selector_cols = expand_selector(frame, item)
            expanded.extend(selector_cols)
        else:
            expanded.append(item)
    return expanded


def _expand_selector_dicts(
    df: DataFrame,
    d: Mapping[Any, Any] | None,
    *,
    expand_keys: bool,
    expand_values: bool,
    tuple_keys: bool = False,
) -> dict[str, Any]:
    """Expand dict key/value selectors into their underlying column names."""
    expanded = {}
    for key, value in (d or {}).items():
        if expand_values and is_selector(value):
            expanded[key] = expand_selector(df, selector=value)
            value = expanded[key]
        if expand_keys and is_selector(key):
            cols = expand_selector(df, selector=key)
            if tuple_keys:
                expanded[cols] = value
            else:
                expanded.update(dict.fromkeys(cols, value))
        else:
            expanded[key] = value
    return expanded


def _combine_as_selector(
    items: (
        str
        | Expr
        | PolarsDataType
        | Selector
        | Collection[str | Expr | PolarsDataType | Selector]
    ),
    *more_items: str | Expr | PolarsDataType | Selector,
) -> Selector:
    """Create a combined selector from cols, names, dtypes, and/or other selectors."""
    names, regexes, dtypes = [], [], []
    selectors: builtins.list[Selector] = []
    for item in (
        *(
            items
            if isinstance(items, Collection) and not isinstance(items, str)
            else [items]
        ),
        *more_items,
    ):
        if is_selector(item):
            selectors.append(item)
        elif is_polars_dtype(item):
            dtypes.append(item)
        elif isinstance(item, str):
            if item.startswith("^") and item.endswith("$"):
                regexes.append(item)
            else:
                names.append(item)
        elif is_column(item):
            names.append(item.meta.output_name())  # type: ignore[union-attr]
        else:
            msg = f"expected one or more `str`, `DataType` or selector; found {item!r} instead."
            raise TypeError(msg)

    selected = []
    if names:
        selected.append(by_name(*names, require_all=False))
    if dtypes:
        selected.append(by_dtype(*dtypes))
    if regexes:
        selected.append(
            matches(
                "|".join(f"({rx})" for rx in regexes)
                if len(regexes) > 1
                else regexes[0]
            )
        )
    if selectors:
        selected.extend(selectors)

    return reduce(or_, selected)


class Selector(Expr):
    """Base column selector expression/proxy."""

    # NOTE: This `= None` is needed to generate the docs with sphinx_accessor.
    _pyselector: PySelector = None  # type: ignore[assignment]

    @classmethod
    def _from_pyselector(cls, pyselector: PySelector) -> Selector:
        slf = cls()
        slf._pyselector = pyselector
        slf._pyexpr = PyExpr.new_selector(pyselector)
        return slf

    def __getstate__(self) -> bytes:
        return self._pyexpr.__getstate__()

    def __setstate__(self, state: bytes) -> None:
        self._pyexpr = F.lit(0)._pyexpr  # Initialize with a dummy
        self._pyexpr.__setstate__(state)
        self._pyselector = self.meta.as_selector()._pyselector

    def __repr__(self) -> str:
        return str(Expr._from_pyexpr(self._pyexpr))

    def __hash__(self) -> int:
        # note: this is a suitable hash for selectors (but NOT expressions in general),
        # as the repr is guaranteed to be unique across all selector/param permutations
        return self._pyselector.hash()

    @classmethod
    def _by_dtype(
        cls, dtypes: builtins.list[PythonDataType | PolarsDataType]
    ) -> Selector:
        selectors = []
        concrete_dtypes = []
        for dt in dtypes:
            if is_polars_dtype(dt):
                if dt is pldt.Datetime:
                    selectors += [datetime()]
                elif isinstance(dt, pldt.Datetime) and dt.time_zone == "*":
                    selectors += [datetime(time_unit=dt.time_unit, time_zone="*")]
                elif dt is pldt.Duration:
                    selectors += [duration()]
                elif dt is pldt.Categorical:
                    selectors += [categorical()]
                elif dt is pldt.Enum:
                    selectors += [enum()]
                elif dt is pldt.List:
                    selectors += [list()]
                elif dt is pldt.Array:
                    selectors += [array()]
                elif dt is pldt.Struct:
                    selectors += [struct()]
                elif dt is pldt.Decimal:
                    selectors += [decimal()]
                else:
                    concrete_dtypes += [dt]
            elif isinstance(dt, type):
                if dt is int:
                    selectors += [integer()]
                elif dt is builtins.float:
                    selectors += [float()]
                elif dt is bool:
                    selectors += [boolean()]
                elif dt is str:
                    concrete_dtypes += [pldt.String()]
                elif dt is bytes:
                    concrete_dtypes += [pldt.Binary()]
                elif dt is object:
                    selectors += [object()]
                elif dt is NoneType:
                    concrete_dtypes += [pldt.Null()]
                elif dt is pydatetime.time:
                    concrete_dtypes += [pldt.Time()]
                elif dt is pydatetime.datetime:
                    selectors += [datetime()]
                elif dt is pydatetime.timedelta:
                    selectors += [duration()]
                elif dt is pydatetime.date:
                    selectors += [date()]
                elif dt is PyDecimal:
                    selectors += [decimal()]
                elif dt is builtins.list or dt is tuple:
                    selectors += [list()]
                else:
                    input_type = (
                        input
                        if type(input) is type
                        else f"of type {type(input).__name__!r}"
                    )
                    input_detail = "" if type(input) is type else f" (given: {input!r})"
                    msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
                    raise TypeError(msg) from None
            else:
                input_type = (
                    input
                    if type(input) is type
                    else f"of type {type(input).__name__!r}"
                )
                input_detail = "" if type(input) is type else f" (given: {input!r})"
                msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
                raise TypeError(msg) from None

        dtype_selector = cls._from_pyselector(PySelector.by_dtype(concrete_dtypes))

        if len(selectors) == 0:
            return dtype_selector

        selector = selectors[0]
        for s in selectors[1:]:
            selector = selector | s
        if len(concrete_dtypes) == 0:
            return selector
        else:
            return dtype_selector | selector

    @classmethod
    def _by_name(cls, names: builtins.list[str], *, strict: bool) -> Selector:
        return cls._from_pyselector(PySelector.by_name(names, strict))

    def __invert__(cls) -> Selector:
        """Invert the selector."""
        return all() - cls

    def __add__(self, other: Any) -> Expr:
        if is_selector(other):
            return self.as_expr().__add__(other.as_expr())
        else:
            return self.as_expr().__add__(other)

    def __radd__(self, other: Any) -> Expr:
        if is_selector(other):
            msg = "unsupported operand type(s) for op: ('Selector' + 'Selector')"
            raise TypeError(msg)
        else:
            return self.as_expr().__radd__(other)

    @overload
    def __and__(self, other: Selector) -> Selector: ...

    @overload
    def __and__(self, other: Any) -> Expr: ...

    def __and__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            colname = other.meta.output_name()
            other = by_name(colname)
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.intersect(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__and__(other)

    def __rand__(self, other: Any) -> Expr:
        return self.as_expr().__rand__(other)

    @overload
    def __or__(self, other: Selector) -> Selector: ...

    @overload
    def __or__(self, other: Any) -> Expr: ...

    def __or__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.union(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__or__(other)

    def __ror__(self, other: Any) -> Expr:
        if is_column(other):
            other = by_name(other.meta.output_name())
        return self.as_expr().__ror__(other)

    @overload
    def __sub__(self, other: Selector) -> Selector: ...

    @overload
    def __sub__(self, other: Any) -> Expr: ...

    def __sub__(self, other: Any) -> Selector | Expr:
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.difference(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__sub__(other)

    def __rsub__(self, other: Any) -> NoReturn:
        msg = "unsupported operand type(s) for op: ('Expr' - 'Selector')"
        raise TypeError(msg)

    @overload
    def __xor__(self, other: Selector) -> Selector: ...

    @overload
    def __xor__(self, other: Any) -> Expr: ...

    def __xor__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.exclusive_or(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__xor__(other)

    def __rxor__(self, other: Any) -> Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        return self.as_expr().__rxor__(other)

    def exclude(
        self,
        columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType],
        *more_columns: str | PolarsDataType,
    ) -> Selector:
        """
        Exclude columns from a multi-column expression.

        Only works after a wildcard or regex column selection, and you cannot provide
        both string column names *and* dtypes (you may prefer to use selectors instead).

        Parameters
        ----------
        columns
            The name or datatype of the column(s) to exclude. Accepts regular expression
            input. Regular expressions should start with `^` and end with `$`.
        *more_columns
            Additional names or datatypes of columns to exclude, specified as positional
            arguments.
        """
        exclude_cols: builtins.list[str] = []
        exclude_dtypes: builtins.list[PolarsDataType] = []
        for item in (
            *(
                columns
                if isinstance(columns, Collection) and not isinstance(columns, str)
                else [columns]
            ),
            *more_columns,
        ):
            if isinstance(item, str):
                exclude_cols.append(item)
            elif is_polars_dtype(item):
                exclude_dtypes.append(item)
            else:
                msg = (
                    "invalid input for `exclude`"
                    f"\n\nExpected one or more `str` or `DataType`; found {item!r} instead."
                )
                raise TypeError(msg)

        if exclude_cols and exclude_dtypes:
            msg = "cannot exclude by both column name and dtype; use a selector instead"
            raise TypeError(msg)
        elif exclude_dtypes:
            return self - by_dtype(exclude_dtypes)
        else:
            return self - by_name(exclude_cols, require_all=False)

    def as_expr(self) -> Expr:
        """
        Materialize the `selector` as a normal expression.

        This ensures that the operators `|`, `&`, `~` and `-`
        are applied on the data and not on the selector sets.

        Examples
        --------
        >>> import polars.selectors as cs
        >>> df = pl.DataFrame(
        ...     {
        ...         "colx": ["aa", "bb", "cc"],
        ...         "coly": [True, False, True],
        ...         "colz": [1, 2, 3],
        ...     }
        ... )

        Inverting the boolean selector will choose the non-boolean columns:

        >>> df.select(~cs.boolean())
        shape: (3, 2)
        ┌──────┬──────┐
        │ colx ┆ colz │
        │ ---  ┆ ---  │
        │ str  ┆ i64  │
        ╞══════╪══════╡
        │ aa   ┆ 1    │
        │ bb   ┆ 2    │
        │ cc   ┆ 3    │
        └──────┴──────┘

        To invert the *values* in the selected boolean columns, we need to
        materialize the selector as a standard expression instead:

        >>> df.select(~cs.boolean().as_expr())
        shape: (3, 1)
        ┌───────┐
        │ coly  │
        │ ---   │
        │ bool  │
        ╞═══════╡
        │ false │
        │ true  │
        │ false │
        └───────┘
        """
        return Expr._from_pyexpr(self._pyexpr)


def _re_string(string: str | Collection[str], *, escape: bool = True) -> str:
    """Return escaped regex, potentially representing multiple string fragments."""
    if isinstance(string, str):
        rx = re_escape(string) if escape else string
    else:
        strings: builtins.list[str] = []
        for st in string:
            if isinstance(st, Collection) and not isinstance(st, str):  # type: ignore[redundant-expr]
                strings.extend(st)
            else:
                strings.append(st)
        rx = "|".join((re_escape(x) if escape else x) for x in strings)
    return f"({rx})"


def empty() -> Selector:
    """
    Select no columns.

    This is useful for composition with other selectors.

    See Also
    --------
    all : Select all columns in the current scope.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> pl.DataFrame({"a": 1, "b": 2}).select(cs.empty())
    shape: (0, 0)
    ┌┐
    ╞╡
    └┘
    """
    return Selector._from_pyselector(PySelector.empty())


def all() -> Selector:
    """
    Select all columns.

    See Also
    --------
    first : Select the first column in the current scope.
    last : Select the last column in the current scope.

    Examples
    --------
    >>> from datetime import date
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [date(1999, 12, 31), date(2024, 1, 1)],
    ...         "value": [1_234_500, 5_000_555],
    ...     },
    ...     schema_overrides={"value": pl.Int32},
    ... )

    Select all columns, casting them to string:

    >>> df.select(cs.all().cast(pl.String))
    shape: (2, 2)
    ┌────────────┬─────────┐
    │ dt         ┆ value   │
    │ ---        ┆ ---     │
    │ str        ┆ str     │
    ╞════════════╪═════════╡
    │ 1999-12-31 ┆ 1234500 │
    │ 2024-01-01 ┆ 5000555 │
    └────────────┴─────────┘

    Select all columns *except* for those matching the given dtypes:

    >>> df.select(cs.all() - cs.numeric())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 1999-12-31 │
    │ 2024-01-01 │
    └────────────┘
    """
    return Selector._from_pyselector(PySelector.all())


def alpha(ascii_only: bool = False, *, ignore_spaces: bool = False) -> Selector:  # noqa: FBT001
    r"""
    Select all columns with alphabetic names (eg: only letters).

    Parameters
    ----------
    ascii_only
        Indicate whether to consider only ASCII alphabetic characters, or the full
        Unicode range of valid letters (accented, idiographic, etc).
    ignore_spaces
        Indicate whether to ignore the presence of spaces in column names; if so,
        only the other (non-space) characters are considered.

    Notes
    -----
    Matching column names cannot contain *any* non-alphabetic characters. Note
    that the definition of "alphabetic" consists of all valid Unicode alphabetic
    characters (`\p{Alphabetic}`) by default; this can be changed by setting
    `ascii_only=True`.

    Examples
    --------
    >>> import polars as pl
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "no1": [100, 200, 300],
    ...         "café": ["espresso", "latte", "mocha"],
    ...         "t or f": [True, False, None],
    ...         "hmm": ["aaa", "bbb", "ccc"],
    ...         "都市": ["東京", "大阪", "京都"],
    ...     }
    ... )

    Select columns with alphabetic names; note that accented
    characters and kanji are recognised as alphabetic here:

    >>> df.select(cs.alpha())
    shape: (3, 3)
    ┌──────────┬─────┬──────┐
    │ café     ┆ hmm ┆ 都市 │
    │ ---      ┆ --- ┆ ---  │
    │ str      ┆ str ┆ str  │
    ╞══════════╪═════╪══════╡
    │ espresso ┆ aaa ┆ 東京 │
    │ latte    ┆ bbb ┆ 大阪 │
    │ mocha    ┆ ccc ┆ 京都 │
    └──────────┴─────┴──────┘

    Constrain the definition of "alphabetic" to ASCII characters only:

    >>> df.select(cs.alpha(ascii_only=True))
    shape: (3, 1)
    ┌─────┐
    │ hmm │
    │ --- │
    │ str │
    ╞═════╡
    │ aaa │
    │ bbb │
    │ ccc │
    └─────┘

    >>> df.select(cs.alpha(ascii_only=True, ignore_spaces=True))
    shape: (3, 2)
    ┌────────┬─────┐
    │ t or f ┆ hmm │
    │ ---    ┆ --- │
    │ bool   ┆ str │
    ╞════════╪═════╡
    │ true   ┆ aaa │
    │ false  ┆ bbb │
    │ null   ┆ ccc │
    └────────┴─────┘

    Select all columns *except* for those with alphabetic names:

    >>> df.select(~cs.alpha())
    shape: (3, 2)
    ┌─────┬────────┐
    │ no1 ┆ t or f │
    │ --- ┆ ---    │
    │ i64 ┆ bool   │
    ╞═════╪════════╡
    │ 100 ┆ true   │
    │ 200 ┆ false  │
    │ 300 ┆ null   │
    └─────┴────────┘

    >>> df.select(~cs.alpha(ignore_spaces=True))
    shape: (3, 1)
    ┌─────┐
    │ no1 │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 100 │
    │ 200 │
    │ 300 │
    └─────┘
    """
    # note that we need to supply a pattern compatible with the *rust* regex crate
    re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
    re_space = " " if ignore_spaces else ""
    return Selector._from_pyselector(PySelector.matches(f"^[{re_alpha}{re_space}]+$"))


def alphanumeric(
    ascii_only: bool = False,  # noqa: FBT001
    *,
    ignore_spaces: bool = False,
) -> Selector:
    r"""
    Select all columns with alphanumeric names (eg: only letters and the digits 0-9).

    Parameters
    ----------
    ascii_only
        Indicate whether to consider only ASCII alphabetic characters, or the full
        Unicode range of valid letters (accented, idiographic, etc).
    ignore_spaces
        Indicate whether to ignore the presence of spaces in column names; if so,
        only the other (non-space) characters are considered.

    Notes
    -----
    Matching column names cannot contain *any* non-alphabetic or integer characters.
    Note that the definition of "alphabetic" consists of all valid Unicode alphabetic
    characters (`\p{Alphabetic}`) and digit characters (`\d`) by default; this
    can be changed by setting `ascii_only=True`.

    Examples
    --------
    >>> import polars as pl
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "1st_col": [100, 200, 300],
    ...         "flagged": [True, False, True],
    ...         "00prefix": ["01:aa", "02:bb", "03:cc"],
    ...         "last col": ["x", "y", "z"],
    ...     }
    ... )

    Select columns with alphanumeric names:

    >>> df.select(cs.alphanumeric())
    shape: (3, 2)
    ┌─────────┬──────────┐
    │ flagged ┆ 00prefix │
    │ ---     ┆ ---      │
    │ bool    ┆ str      │
    ╞═════════╪══════════╡
    │ true    ┆ 01:aa    │
    │ false   ┆ 02:bb    │
    │ true    ┆ 03:cc    │
    └─────────┴──────────┘

    >>> df.select(cs.alphanumeric(ignore_spaces=True))
    shape: (3, 3)
    ┌─────────┬──────────┬──────────┐
    │ flagged ┆ 00prefix ┆ last col │
    │ ---     ┆ ---      ┆ ---      │
    │ bool    ┆ str      ┆ str      │
    ╞═════════╪══════════╪══════════╡
    │ true    ┆ 01:aa    ┆ x        │
    │ false   ┆ 02:bb    ┆ y        │
    │ true    ┆ 03:cc    ┆ z        │
    └─────────┴──────────┴──────────┘

    Select all columns *except* for those with alphanumeric names:

    >>> df.select(~cs.alphanumeric())
    shape: (3, 2)
    ┌─────────┬──────────┐
    │ 1st_col ┆ last col │
    │ ---     ┆ ---      │
    │ i64     ┆ str      │
    ╞═════════╪══════════╡
    │ 100     ┆ x        │
    │ 200     ┆ y        │
    │ 300     ┆ z        │
    └─────────┴──────────┘

    >>> df.select(~cs.alphanumeric(ignore_spaces=True))
    shape: (3, 1)
    ┌─────────┐
    │ 1st_col │
    │ ---     │
    │ i64     │
    ╞═════════╡
    │ 100     │
    │ 200     │
    │ 300     │
    └─────────┘
    """
    # note that we need to supply patterns compatible with the *rust* regex crate
    re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
    re_digit = "0-9" if ascii_only else r"\d"
    re_space = " " if ignore_spaces else ""
    return Selector._from_pyselector(
        PySelector.matches(f"^[{re_alpha}{re_digit}{re_space}]+$")
    )


def binary() -> Selector:
    """
    Select all binary columns.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    string : Select all string columns (optionally including categoricals).

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame({"a": [b"hello"], "b": ["world"], "c": [b"!"], "d": [":)"]})
    >>> df
    shape: (1, 4)
    ┌──────────┬───────┬────────┬─────┐
    │ a        ┆ b     ┆ c      ┆ d   │
    │ ---      ┆ ---   ┆ ---    ┆ --- │
    │ binary   ┆ str   ┆ binary ┆ str │
    ╞══════════╪═══════╪════════╪═════╡
    │ b"hello" ┆ world ┆ b"!"   ┆ :)  │
    └──────────┴───────┴────────┴─────┘

    Select binary columns and export as a dict:

    >>> df.select(cs.binary()).to_dict(as_series=False)
    {'a': [b'hello'], 'c': [b'!']}

    Select all columns *except* for those that are binary:

    >>> df.select(~cs.binary()).to_dict(as_series=False)
    {'b': ['world'], 'd': [':)']}
    """
    return by_dtype([Binary])


def boolean() -> Selector:
    """
    Select all boolean columns.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame({"n": range(1, 5)}).with_columns(n_even=pl.col("n") % 2 == 0)
    >>> df
    shape: (4, 2)
    ┌─────┬────────┐
    │ n   ┆ n_even │
    │ --- ┆ ---    │
    │ i64 ┆ bool   │
    ╞═════╪════════╡
    │ 1   ┆ false  │
    │ 2   ┆ true   │
    │ 3   ┆ false  │
    │ 4   ┆ true   │
    └─────┴────────┘

    Select and invert boolean columns:

    >>> df.with_columns(is_odd=cs.boolean().not_())
    shape: (4, 3)
    ┌─────┬────────┬────────┐
    │ n   ┆ n_even ┆ is_odd │
    │ --- ┆ ---    ┆ ---    │
    │ i64 ┆ bool   ┆ bool   │
    ╞═════╪════════╪════════╡
    │ 1   ┆ false  ┆ true   │
    │ 2   ┆ true   ┆ false  │
    │ 3   ┆ false  ┆ true   │
    │ 4   ┆ true   ┆ false  │
    └─────┴────────┴────────┘

    Select all columns *except* for those that are boolean:

    >>> df.select(~cs.boolean())
    shape: (4, 1)
    ┌─────┐
    │ n   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 3   │
    │ 4   │
    └─────┘
    """
    return by_dtype([Boolean])


def by_dtype(
    *dtypes: (
        PolarsDataType
        | PythonDataType
        | Iterable[PolarsDataType]
        | Iterable[PythonDataType]
    ),
) -> Selector:
    """
    Select all columns matching the given dtypes.

    See Also
    --------
    by_name : Select all columns matching the given names.
    by_index : Select all columns matching the given indices.

    Examples
    --------
    >>> from datetime import date
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [date(1999, 12, 31), date(2024, 1, 1), date(2010, 7, 5)],
    ...         "value": [1_234_500, 5_000_555, -4_500_000],
    ...         "other": ["foo", "bar", "foo"],
    ...     }
    ... )

    Select all columns with date or string dtypes:

    >>> df.select(cs.by_dtype(pl.Date, pl.String))
    shape: (3, 2)
    ┌────────────┬───────┐
    │ dt         ┆ other │
    │ ---        ┆ ---   │
    │ date       ┆ str   │
    ╞════════════╪═══════╡
    │ 1999-12-31 ┆ foo   │
    │ 2024-01-01 ┆ bar   │
    │ 2010-07-05 ┆ foo   │
    └────────────┴───────┘

    Select all columns that are not of date or string dtype:

    >>> df.select(~cs.by_dtype(pl.Date, pl.String))
    shape: (3, 1)
    ┌──────────┐
    │ value    │
    │ ---      │
    │ i64      │
    ╞══════════╡
    │ 1234500  │
    │ 5000555  │
    │ -4500000 │
    └──────────┘

    Group by string columns and sum the numeric columns:

    >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by="other")
    shape: (2, 2)
    ┌───────┬──────────┐
    │ other ┆ value    │
    │ ---   ┆ ---      │
    │ str   ┆ i64      │
    ╞═══════╪══════════╡
    │ bar   ┆ 5000555  │
    │ foo   ┆ -3265500 │
    └───────┴──────────┘
    """
    all_dtypes: builtins.list[PolarsDataType | PythonDataType] = []
    for tp in dtypes:
        if is_polars_dtype(tp) or isinstance(tp, type):
            all_dtypes.append(tp)
        elif isinstance(tp, Collection):
            for t in tp:
                if not (is_polars_dtype(t) or isinstance(t, type)):
                    msg = f"invalid dtype: {t!r}"
                    raise TypeError(msg)
                all_dtypes.append(t)
        else:
            msg = f"invalid dtype: {tp!r}"
            raise TypeError(msg)

    return Selector._by_dtype(all_dtypes)


def by_index(
    *indices: int | range | Sequence[int | range], require_all: bool = True
) -> Selector:
    """
    Select all columns matching the given indices (or range objects).

    Parameters
    ----------
    *indices
        One or more column indices (or range objects).
        Negative indexing is supported.
    require_all
        By default, all specified indices must be valid; if any index is out of bounds,
        an error is raised. If set to `False`, out-of-bounds indices are ignored

    Notes
    -----
    Matching columns are returned in the order in which their indexes
    appear in the selector, not the underlying schema order.

    See Also
    --------
    by_dtype : Select all columns matching the given dtypes.
    by_name : Select all columns matching the given names.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "key": ["abc"],
    ...         **{f"c{i:02}": [0.5 * i] for i in range(100)},
    ...     },
    ... )
    >>> print(df)
    shape: (1, 101)
    ┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
    │ key ┆ c00 ┆ c01 ┆ c02 ┆ … ┆ c96  ┆ c97  ┆ c98  ┆ c99  │
    │ --- ┆ --- ┆ --- ┆ --- ┆   ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ str ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64  ┆ f64  ┆ f64  ┆ f64  │
    ╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
    │ abc ┆ 0.0 ┆ 0.5 ┆ 1.0 ┆ … ┆ 48.0 ┆ 48.5 ┆ 49.0 ┆ 49.5 │
    └─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘

    Select columns by index ("key" column and the two first/last columns):

    >>> df.select(cs.by_index(0, 1, 2, -2, -1))
    shape: (1, 5)
    ┌─────┬─────┬─────┬──────┬──────┐
    │ key ┆ c00 ┆ c01 ┆ c98  ┆ c99  │
    │ --- ┆ --- ┆ --- ┆ ---  ┆ ---  │
    │ str ┆ f64 ┆ f64 ┆ f64  ┆ f64  │
    ╞═════╪═════╪═════╪══════╪══════╡
    │ abc ┆ 0.0 ┆ 0.5 ┆ 49.0 ┆ 49.5 │
    └─────┴─────┴─────┴──────┴──────┘

    Select the "key" column and use a `range` object to select various columns.
    Note that you can freely mix and match integer indices and `range` objects:

    >>> df.select(cs.by_index(0, range(1, 101, 20)))
    shape: (1, 6)
    ┌─────┬─────┬──────┬──────┬──────┬──────┐
    │ key ┆ c00 ┆ c20  ┆ c40  ┆ c60  ┆ c80  │
    │ --- ┆ --- ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ str ┆ f64 ┆ f64  ┆ f64  ┆ f64  ┆ f64  │
    ╞═════╪═════╪══════╪══════╪══════╪══════╡
    │ abc ┆ 0.0 ┆ 10.0 ┆ 20.0 ┆ 30.0 ┆ 40.0 │
    └─────┴─────┴──────┴──────┴──────┴──────┘

    >>> df.select(cs.by_index(0, range(101, 0, -25), require_all=False))
    shape: (1, 5)
    ┌─────┬──────┬──────┬──────┬─────┐
    │ key ┆ c75  ┆ c50  ┆ c25  ┆ c00 │
    │ --- ┆ ---  ┆ ---  ┆ ---  ┆ --- │
    │ str ┆ f64  ┆ f64  ┆ f64  ┆ f64 │
    ╞═════╪══════╪══════╪══════╪═════╡
    │ abc ┆ 37.5 ┆ 25.0 ┆ 12.5 ┆ 0.0 │
    └─────┴──────┴──────┴──────┴─────┘

    Select all columns *except* for the even-indexed ones:

    >>> df.select(~cs.by_index(range(1, 100, 2)))
    shape: (1, 51)
    ┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
    │ key ┆ c01 ┆ c03 ┆ c05 ┆ … ┆ c93  ┆ c95  ┆ c97  ┆ c99  │
    │ --- ┆ --- ┆ --- ┆ --- ┆   ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ str ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64  ┆ f64  ┆ f64  ┆ f64  │
    ╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
    │ abc ┆ 0.5 ┆ 1.5 ┆ 2.5 ┆ … ┆ 46.5 ┆ 47.5 ┆ 48.5 ┆ 49.5 │
    └─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘
    """
    all_indices: builtins.list[int] = []
    for idx in indices:
        if isinstance(idx, (range, Sequence)):
            all_indices.extend(idx)  # type: ignore[arg-type]
        elif isinstance(idx, int):
            all_indices.append(idx)
        else:
            msg = f"invalid index value: {idx!r}"
            raise TypeError(msg)

    return Selector._from_pyselector(PySelector.by_index(all_indices, require_all))


def by_name(*names: str | Collection[str], require_all: bool = True) -> Selector:
    """
    Select all columns matching the given names.

    .. versionadded:: 0.20.27
      The `require_all` parameter was added.

    Parameters
    ----------
    *names
        One or more names of columns to select.
    require_all
        Whether to match *all* names (the default) or *any* of the names.

    Notes
    -----
    Matching columns are returned in the order in which they are declared in
    the selector, not the underlying schema order.

    See Also
    --------
    by_dtype : Select all columns matching the given dtypes.
    by_index : Select all columns matching the given indices.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [False, True],
    ...     }
    ... )

    Select columns by name:

    >>> df.select(cs.by_name("foo", "bar"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ foo ┆ bar │
    │ --- ┆ --- │
    │ str ┆ i64 │
    ╞═════╪═════╡
    │ x   ┆ 123 │
    │ y   ┆ 456 │
    └─────┴─────┘

    Match *any* of the given columns by name:

    >>> df.select(cs.by_name("baz", "moose", "foo", "bear", require_all=False))
    shape: (2, 2)
    ┌─────┬─────┐
    │ baz ┆ foo │
    │ --- ┆ --- │
    │ f64 ┆ str │
    ╞═════╪═════╡
    │ 2.0 ┆ x   │
    │ 5.5 ┆ y   │
    └─────┴─────┘

    Match all columns *except* for those given:

    >>> df.select(~cs.by_name("foo", "bar"))
    shape: (2, 2)
    ┌─────┬───────┐
    │ baz ┆ zap   │
    │ --- ┆ ---   │
    │ f64 ┆ bool  │
    ╞═════╪═══════╡
    │ 2.0 ┆ false │
    │ 5.5 ┆ true  │
    └─────┴───────┘
    """
    all_names = []
    for nm in names:
        if isinstance(nm, str):
            all_names.append(nm)
        elif isinstance(nm, Collection):
            for n in nm:
                if not isinstance(n, str):
                    msg = f"invalid name: {n!r}"
                    raise TypeError(msg)
                all_names.append(n)
        else:
            msg = f"invalid name: {nm!r}"
            raise TypeError(msg)

    return Selector._by_name(all_names, strict=require_all)


@unstable()
def enum() -> Selector:
    """
    Select all enum columns.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    categorical : Select all categorical columns.
    string : Select all string columns (optionally including categoricals).

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["xx", "yy"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...     },
    ...     schema_overrides={"foo": pl.Enum(["xx", "yy"])},
    ... )

    Select all enum columns:

    >>> df.select(cs.enum())
    shape: (2, 1)
    ┌──────┐
    │ foo  │
    │ ---  │
    │ enum │
    ╞══════╡
    │ xx   │
    │ yy   │
    └──────┘

    Select all columns *except* for those that are enum:

    >>> df.select(~cs.enum())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.enum_())


@unstable()
def list(inner: None | Selector = None) -> Selector:
    """
    Select all list columns.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    array : Select all array columns.
    nested : Select all nested columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [["xx", "yy"], ["x"]],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...     },
    ... )

    Select all list columns:

    >>> df.select(cs.list())
    shape: (2, 1)
    ┌──────────────┐
    │ foo          │
    │ ---          │
    │ list[str]    │
    ╞══════════════╡
    │ ["xx", "yy"] │
    │ ["x"]        │
    └──────────────┘

    Select all columns *except* for those that are list:

    >>> df.select(~cs.list())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘

    Select all list columns with a certain matching inner type:

    >>> df.select(cs.list(cs.string()))
    shape: (2, 1)
    ┌──────────────┐
    │ foo          │
    │ ---          │
    │ list[str]    │
    ╞══════════════╡
    │ ["xx", "yy"] │
    │ ["x"]        │
    └──────────────┘
    >>> df.select(cs.list(cs.integer()))
    shape: (0, 0)
    ┌┐
    ╞╡
    └┘
    """
    inner_s = inner._pyselector if inner is not None else None
    return Selector._from_pyselector(PySelector.list(inner_s))


@unstable()
def array(inner: Selector | None = None, *, width: int | None = None) -> Selector:
    """
    Select all array columns.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    list : Select all list columns.
    nested : Select all nested columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [["xx", "yy"], ["x", "y"]],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...     },
    ...     schema_overrides={"foo": pl.Array(pl.String, 2)},
    ... )

    Select all array columns:

    >>> df.select(cs.array())
    shape: (2, 1)
    ┌───────────────┐
    │ foo           │
    │ ---           │
    │ array[str, 2] │
    ╞═══════════════╡
    │ ["xx", "yy"]  │
    │ ["x", "y"]    │
    └───────────────┘

    Select all columns *except* for those that are array:

    >>> df.select(~cs.array())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘

    Select all array columns with a certain matching inner type:

    >>> df.select(cs.array(cs.string()))
    shape: (2, 1)
    ┌───────────────┐
    │ foo           │
    │ ---           │
    │ array[str, 2] │
    ╞═══════════════╡
    │ ["xx", "yy"]  │
    │ ["x", "y"]    │
    └───────────────┘
    >>> df.select(cs.array(cs.integer()))
    shape: (0, 0)
    ┌┐
    ╞╡
    └┘
    >>> df.select(cs.array(width=2))
    shape: (2, 1)
    ┌───────────────┐
    │ foo           │
    │ ---           │
    │ array[str, 2] │
    ╞═══════════════╡
    │ ["xx", "yy"]  │
    │ ["x", "y"]    │
    └───────────────┘
    >>> df.select(cs.array(width=3))
    shape: (0, 0)
    ┌┐
    ╞╡
    └┘
    """
    inner_s = inner._pyselector if inner is not None else None
    return Selector._from_pyselector(PySelector.array(inner_s, width))


@unstable()
def struct() -> Selector:
    """
    Select all struct columns.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    list : Select all list columns.
    array : Select all array columns.
    nested : Select all nested columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...     },
    ... )

    Select all struct columns:

    >>> df.select(cs.struct())
    shape: (2, 1)
    ┌────────────┐
    │ foo        │
    │ ---        │
    │ struct[2]  │
    ╞════════════╡
    │ {"xx","z"} │
    │ {"x","y"}  │
    └────────────┘

    Select all columns *except* for those that are struct:

    >>> df.select(~cs.struct())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.struct_())


@unstable()
def nested() -> Selector:
    """
    Select all nested columns.

    A nested column is a list, array or struct.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    list : Select all list columns.
    array : Select all array columns.
    struct : Select all struct columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "wow": [[1, 2], [3]],
    ...     },
    ... )

    Select all nested columns:

    >>> df.select(cs.nested())
    shape: (2, 2)
    ┌────────────┬───────────┐
    │ foo        ┆ wow       │
    │ ---        ┆ ---       │
    │ struct[2]  ┆ list[i64] │
    ╞════════════╪═══════════╡
    │ {"xx","z"} ┆ [1, 2]    │
    │ {"x","y"}  ┆ [3]       │
    └────────────┴───────────┘

    Select all columns *except* for those that are nested:

    >>> df.select(~cs.nested())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.nested())


def categorical() -> Selector:
    """
    Select all categorical columns.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    string : Select all string columns (optionally including categoricals).

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["xx", "yy"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...     },
    ...     schema_overrides={"foo": pl.Categorical},
    ... )

    Select all categorical columns:

    >>> df.select(cs.categorical())
    shape: (2, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ cat │
    ╞═════╡
    │ xx  │
    │ yy  │
    └─────┘

    Select all columns *except* for those that are categorical:

    >>> df.select(~cs.categorical())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.categorical())


def contains(*substring: str) -> Selector:
    """
    Select columns whose names contain the given literal substring(s).

    Parameters
    ----------
    substring
        Substring(s) that matching column names should contain.

    See Also
    --------
    matches : Select all columns that match the given regex pattern.
    ends_with : Select columns that end with the given substring(s).
    starts_with : Select columns that start with the given substring(s).

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [False, True],
    ...     }
    ... )

    Select columns that contain the substring 'ba':

    >>> df.select(cs.contains("ba"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘

    Select columns that contain the substring 'ba' or the letter 'z':

    >>> df.select(cs.contains("ba", "z"))
    shape: (2, 3)
    ┌─────┬─────┬───────┐
    │ bar ┆ baz ┆ zap   │
    │ --- ┆ --- ┆ ---   │
    │ i64 ┆ f64 ┆ bool  │
    ╞═════╪═════╪═══════╡
    │ 123 ┆ 2.0 ┆ false │
    │ 456 ┆ 5.5 ┆ true  │
    └─────┴─────┴───────┘

    Select all columns *except* for those that contain the substring 'ba':

    >>> df.select(~cs.contains("ba"))
    shape: (2, 2)
    ┌─────┬───────┐
    │ foo ┆ zap   │
    │ --- ┆ ---   │
    │ str ┆ bool  │
    ╞═════╪═══════╡
    │ x   ┆ false │
    │ y   ┆ true  │
    └─────┴───────┘
    """
    escaped_substring = _re_string(substring)
    raw_params = f"^.*{escaped_substring}.*$"

    return Selector._from_pyselector(PySelector.matches(raw_params))


def date() -> Selector:
    """
    Select all date columns.

    See Also
    --------
    datetime : Select all datetime columns, optionally filtering by time unit/zone.
    duration : Select all duration columns, optionally filtering by time unit.
    temporal : Select all temporal columns.
    time : Select all time columns.

    Examples
    --------
    >>> from datetime import date, datetime, time
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
    ...         "dt": [date(1999, 12, 31), date(2024, 8, 9)],
    ...         "tm": [time(0, 0, 0), time(23, 59, 59)],
    ...     },
    ... )

    Select all date columns:

    >>> df.select(cs.date())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 1999-12-31 │
    │ 2024-08-09 │
    └────────────┘

    Select all columns *except* for those that are dates:

    >>> df.select(~cs.date())
    shape: (2, 2)
    ┌─────────────────────┬──────────┐
    │ dtm                 ┆ tm       │
    │ ---                 ┆ ---      │
    │ datetime[μs]        ┆ time     │
    ╞═════════════════════╪══════════╡
    │ 2001-05-07 10:25:00 ┆ 00:00:00 │
    │ 2031-12-31 00:30:00 ┆ 23:59:59 │
    └─────────────────────┴──────────┘
    """
    return by_dtype([Date])


def datetime(
    time_unit: TimeUnit | Collection[TimeUnit] | None = None,
    time_zone: (
        str | pydatetime.timezone | Collection[str | pydatetime.timezone | None] | None
    ) = (
        "*",
        None,
    ),
) -> Selector:
    """
    Select all datetime columns, optionally filtering by time unit/zone.

    Parameters
    ----------
    time_unit
        One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
        Omit to select columns with any valid timeunit.
    time_zone
        * One or more timezone strings, as defined in zoneinfo (to see valid options
          run `import zoneinfo; zoneinfo.available_timezones()` for a full list).
        * Set `None` to select Datetime columns that do not have a timezone.
        * Set "*" to select Datetime columns that have *any* timezone.

    See Also
    --------
    date : Select all date columns.
    duration : Select all duration columns, optionally filtering by time unit.
    temporal : Select all temporal columns.
    time : Select all time columns.

    Examples
    --------
    >>> from datetime import datetime, date, timezone
    >>> import polars.selectors as cs
    >>> from zoneinfo import ZoneInfo
    >>> tokyo_tz = ZoneInfo("Asia/Tokyo")
    >>> utc_tz = timezone.utc
    >>> df = pl.DataFrame(
    ...     {
    ...         "tstamp_tokyo": [
    ...             datetime(1999, 7, 21, 5, 20, 16, 987654, tzinfo=tokyo_tz),
    ...             datetime(2000, 5, 16, 6, 21, 21, 123465, tzinfo=tokyo_tz),
    ...         ],
    ...         "tstamp_utc": [
    ...             datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz),
    ...             datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz),
    ...         ],
    ...         "tstamp": [
    ...             datetime(2000, 11, 20, 18, 12, 16, 600000),
    ...             datetime(2020, 10, 30, 10, 20, 25, 123000),
    ...         ],
    ...         "dt": [date(1999, 12, 31), date(2010, 7, 5)],
    ...     },
    ...     schema_overrides={
    ...         "tstamp_tokyo": pl.Datetime("ns", "Asia/Tokyo"),
    ...         "tstamp_utc": pl.Datetime("us", "UTC"),
    ...     },
    ... )

    Select all datetime columns:

    >>> df.select(cs.datetime())
    shape: (2, 3)
    ┌────────────────────────────────┬─────────────────────────────┬─────────────────────────┐
    │ tstamp_tokyo                   ┆ tstamp_utc                  ┆ tstamp                  │
    │ ---                            ┆ ---                         ┆ ---                     │
    │ datetime[ns, Asia/Tokyo]       ┆ datetime[μs, UTC]           ┆ datetime[μs]            │
    ╞════════════════════════════════╪═════════════════════════════╪═════════════════════════╡
    │ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
    │ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
    └────────────────────────────────┴─────────────────────────────┴─────────────────────────┘

    Select all datetime columns that have 'us' precision:

    >>> df.select(cs.datetime("us"))
    shape: (2, 2)
    ┌─────────────────────────────┬─────────────────────────┐
    │ tstamp_utc                  ┆ tstamp                  │
    │ ---                         ┆ ---                     │
    │ datetime[μs, UTC]           ┆ datetime[μs]            │
    ╞═════════════════════════════╪═════════════════════════╡
    │ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
    │ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
    └─────────────────────────────┴─────────────────────────┘

    Select all datetime columns that have *any* timezone:

    >>> df.select(cs.datetime(time_zone="*"))
    shape: (2, 2)
    ┌────────────────────────────────┬─────────────────────────────┐
    │ tstamp_tokyo                   ┆ tstamp_utc                  │
    │ ---                            ┆ ---                         │
    │ datetime[ns, Asia/Tokyo]       ┆ datetime[μs, UTC]           │
    ╞════════════════════════════════╪═════════════════════════════╡
    │ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC │
    │ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC │
    └────────────────────────────────┴─────────────────────────────┘

    Select all datetime columns that have a *specific* timezone:

    >>> df.select(cs.datetime(time_zone="UTC"))
    shape: (2, 1)
    ┌─────────────────────────────┐
    │ tstamp_utc                  │
    │ ---                         │
    │ datetime[μs, UTC]           │
    ╞═════════════════════════════╡
    │ 2023-04-10 12:14:16.999 UTC │
    │ 2025-08-25 14:18:22.666 UTC │
    └─────────────────────────────┘

    Select all datetime columns that have NO timezone:

    >>> df.select(cs.datetime(time_zone=None))
    shape: (2, 1)
    ┌─────────────────────────┐
    │ tstamp                  │
    │ ---                     │
    │ datetime[μs]            │
    ╞═════════════════════════╡
    │ 2000-11-20 18:12:16.600 │
    │ 2020-10-30 10:20:25.123 │
    └─────────────────────────┘

    Select all columns *except* for datetime columns:

    >>> df.select(~cs.datetime())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 1999-12-31 │
    │ 2010-07-05 │
    └────────────┘
    """  # noqa: W505
    if time_unit is None:
        time_unit_lst = ["ms", "us", "ns"]
    else:
        time_unit_lst = (
            [time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
        )

    time_zone_lst: builtins.list[str | pydatetime.timezone | None]
    if time_zone is None:
        time_zone_lst = [None]
    elif time_zone:
        time_zone_lst = (
            [time_zone]
            if isinstance(time_zone, (str, pydatetime.timezone))
            else builtins.list(time_zone)
        )

    return Selector._from_pyselector(PySelector.datetime(time_unit_lst, time_zone_lst))


def decimal() -> Selector:
    """
    Select all decimal columns.

    See Also
    --------
    float : Select all float columns.
    integer : Select all integer columns.
    numeric : Select all numeric columns.

    Examples
    --------
    >>> from decimal import Decimal as D
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [D(123), D(456)],
    ...         "baz": [D("2.0005"), D("-50.5555")],
    ...     },
    ...     schema_overrides={"baz": pl.Decimal(scale=5, precision=10)},
    ... )

    Select all decimal columns:

    >>> df.select(cs.decimal())
    shape: (2, 2)
    ┌───────────────┬───────────────┐
    │ bar           ┆ baz           │
    │ ---           ┆ ---           │
    │ decimal[38,0] ┆ decimal[10,5] │
    ╞═══════════════╪═══════════════╡
    │ 123           ┆ 2.00050       │
    │ 456           ┆ -50.55550     │
    └───────────────┴───────────────┘

    Select all columns *except* the decimal ones:

    >>> df.select(~cs.decimal())
    shape: (2, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ str │
    ╞═════╡
    │ x   │
    │ y   │
    └─────┘
    """
    # TODO: allow explicit selection by scale/precision?
    return Selector._from_pyselector(PySelector.decimal())


def digit(ascii_only: bool = False) -> Selector:  # noqa: FBT001
    r"""
    Select all columns having names consisting only of digits.

    Notes
    -----
    Matching column names cannot contain *any* non-digit characters. Note that the
    definition of "digit" consists of all valid Unicode digit characters (`\d`)
    by default; this can be changed by setting `ascii_only=True`.

    Examples
    --------
    >>> import polars as pl
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "key": ["aaa", "bbb", "aaa", "bbb", "bbb"],
    ...         "year": [2001, 2001, 2025, 2025, 2001],
    ...         "value": [-25, 100, 75, -15, -5],
    ...     }
    ... ).pivot(
    ...     values="value",
    ...     index="key",
    ...     on="year",
    ...     aggregate_function="sum",
    ... )
    >>> print(df)
    shape: (2, 3)
    ┌─────┬──────┬──────┐
    │ key ┆ 2001 ┆ 2025 │
    │ --- ┆ ---  ┆ ---  │
    │ str ┆ i64  ┆ i64  │
    ╞═════╪══════╪══════╡
    │ aaa ┆ -25  ┆ 75   │
    │ bbb ┆ 95   ┆ -15  │
    └─────┴──────┴──────┘

    Select columns with digit names:

    >>> df.select(cs.digit())
    shape: (2, 2)
    ┌──────┬──────┐
    │ 2001 ┆ 2025 │
    │ ---  ┆ ---  │
    │ i64  ┆ i64  │
    ╞══════╪══════╡
    │ -25  ┆ 75   │
    │ 95   ┆ -15  │
    └──────┴──────┘

    Select all columns *except* for those with digit names:

    >>> df.select(~cs.digit())
    shape: (2, 1)
    ┌─────┐
    │ key │
    │ --- │
    │ str │
    ╞═════╡
    │ aaa │
    │ bbb │
    └─────┘

    Demonstrate use of `ascii_only` flag (by default all valid unicode digits
    are considered, but this can be constrained to ascii 0-9):

    >>> df = pl.DataFrame({"१९९९": [1999], "२०७७": [2077], "3000": [3000]})
    >>> df.select(cs.digit())
    shape: (1, 3)
    ┌──────┬──────┬──────┐
    │ १९९९ ┆ २०७७ ┆ 3000 │
    │ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ i64  ┆ i64  │
    ╞══════╪══════╪══════╡
    │ 1999 ┆ 2077 ┆ 3000 │
    └──────┴──────┴──────┘

    >>> df.select(cs.digit(ascii_only=True))
    shape: (1, 1)
    ┌──────┐
    │ 3000 │
    │ ---  │
    │ i64  │
    ╞══════╡
    │ 3000 │
    └──────┘
    """
    re_digit = r"[0-9]" if ascii_only else r"\d"
    return Selector._from_pyselector(PySelector.matches(rf"^{re_digit}+$"))


def duration(
    time_unit: TimeUnit | Collection[TimeUnit] | None = None,
) -> Selector:
    """
    Select all duration columns, optionally filtering by time unit.

    Parameters
    ----------
    time_unit
        One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
        Omit to select columns with any valid timeunit.

    See Also
    --------
    date : Select all date columns.
    datetime : Select all datetime columns, optionally filtering by time unit/zone.
    temporal : Select all temporal columns.
    time : Select all time columns.

    Examples
    --------
    >>> from datetime import date, timedelta
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [date(2022, 1, 31), date(2025, 7, 5)],
    ...         "td1": [
    ...             timedelta(days=1, milliseconds=123456),
    ...             timedelta(days=1, hours=23, microseconds=987000),
    ...         ],
    ...         "td2": [
    ...             timedelta(days=7, microseconds=456789),
    ...             timedelta(days=14, minutes=999, seconds=59),
    ...         ],
    ...         "td3": [
    ...             timedelta(weeks=4, days=-10, microseconds=999999),
    ...             timedelta(weeks=3, milliseconds=123456, microseconds=1),
    ...         ],
    ...     },
    ...     schema_overrides={
    ...         "td1": pl.Duration("ms"),
    ...         "td2": pl.Duration("us"),
    ...         "td3": pl.Duration("ns"),
    ...     },
    ... )

    Select all duration columns:

    >>> df.select(cs.duration())
    shape: (2, 3)
    ┌────────────────┬─────────────────┬────────────────────┐
    │ td1            ┆ td2             ┆ td3                │
    │ ---            ┆ ---             ┆ ---                │
    │ duration[ms]   ┆ duration[μs]    ┆ duration[ns]       │
    ╞════════════════╪═════════════════╪════════════════════╡
    │ 1d 2m 3s 456ms ┆ 7d 456789µs     ┆ 18d 999999µs       │
    │ 1d 23h 987ms   ┆ 14d 16h 39m 59s ┆ 21d 2m 3s 456001µs │
    └────────────────┴─────────────────┴────────────────────┘

    Select all duration columns that have 'ms' precision:

    >>> df.select(cs.duration("ms"))
    shape: (2, 1)
    ┌────────────────┐
    │ td1            │
    │ ---            │
    │ duration[ms]   │
    ╞════════════════╡
    │ 1d 2m 3s 456ms │
    │ 1d 23h 987ms   │
    └────────────────┘

    Select all duration columns that have 'ms' OR 'ns' precision:

    >>> df.select(cs.duration(["ms", "ns"]))
    shape: (2, 2)
    ┌────────────────┬────────────────────┐
    │ td1            ┆ td3                │
    │ ---            ┆ ---                │
    │ duration[ms]   ┆ duration[ns]       │
    ╞════════════════╪════════════════════╡
    │ 1d 2m 3s 456ms ┆ 18d 999999µs       │
    │ 1d 23h 987ms   ┆ 21d 2m 3s 456001µs │
    └────────────────┴────────────────────┘

    Select all columns *except* for duration columns:

    >>> df.select(~cs.duration())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 2022-01-31 │
    │ 2025-07-05 │
    └────────────┘
    """
    if time_unit is None:
        time_unit = ["ms", "us", "ns"]
    else:
        time_unit = (
            [time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
        )

    return Selector._from_pyselector(PySelector.duration(time_unit))


def ends_with(*suffix: str) -> Selector:
    """
    Select columns that end with the given substring(s).

    See Also
    --------
    contains : Select columns that contain the given literal substring(s).
    matches : Select all columns that match the given regex pattern.
    starts_with : Select columns that start with the given substring(s).

    Parameters
    ----------
    suffix
        Substring(s) that matching column names should end with.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [False, True],
    ...     }
    ... )

    Select columns that end with the substring 'z':

    >>> df.select(cs.ends_with("z"))
    shape: (2, 1)
    ┌─────┐
    │ baz │
    │ --- │
    │ f64 │
    ╞═════╡
    │ 2.0 │
    │ 5.5 │
    └─────┘

    Select columns that end with *either* the letter 'z' or 'r':

    >>> df.select(cs.ends_with("z", "r"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘

    Select all columns *except* for those that end with the substring 'z':

    >>> df.select(~cs.ends_with("z"))
    shape: (2, 3)
    ┌─────┬─────┬───────┐
    │ foo ┆ bar ┆ zap   │
    │ --- ┆ --- ┆ ---   │
    │ str ┆ i64 ┆ bool  │
    ╞═════╪═════╪═══════╡
    │ x   ┆ 123 ┆ false │
    │ y   ┆ 456 ┆ true  │
    └─────┴─────┴───────┘
    """
    escaped_suffix = _re_string(suffix)
    raw_params = f"^.*{escaped_suffix}$"

    return Selector._from_pyselector(PySelector.matches(raw_params))


def exclude(
    columns: (
        str
        | PolarsDataType
        | Selector
        | Expr
        | Collection[str | PolarsDataType | Selector | Expr]
    ),
    *more_columns: str | PolarsDataType | Selector | Expr,
) -> Selector:
    """
    Select all columns except those matching the given columns, datatypes, or selectors.

    Parameters
    ----------
    columns
        One or more columns (col or name), datatypes, columns, or selectors representing
        the columns to exclude.
    *more_columns
        Additional columns, datatypes, or selectors to exclude, specified as positional
        arguments.

    Notes
    -----
    If excluding a single selector it is simpler to write as `~selector` instead.

    Examples
    --------
    Exclude by column name(s):

    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "aa": [1, 2, 3],
    ...         "ba": ["a", "b", None],
    ...         "cc": [None, 2.5, 1.5],
    ...     }
    ... )
    >>> df.select(cs.exclude("ba", "xx"))
    shape: (3, 2)
    ┌─────┬──────┐
    │ aa  ┆ cc   │
    │ --- ┆ ---  │
    │ i64 ┆ f64  │
    ╞═════╪══════╡
    │ 1   ┆ null │
    │ 2   ┆ 2.5  │
    │ 3   ┆ 1.5  │
    └─────┴──────┘

    Exclude using a column name, a selector, and a dtype:

    >>> df.select(cs.exclude("aa", cs.string(), pl.UInt32))
    shape: (3, 1)
    ┌──────┐
    │ cc   │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ null │
    │ 2.5  │
    │ 1.5  │
    └──────┘
    """
    return ~_combine_as_selector(columns, *more_columns)


def first(*, strict: bool = True) -> Selector:
    """
    Select the first column in the current scope.

    See Also
    --------
    all : Select all columns.
    last : Select the last column in the current scope.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0, 1],
    ...     }
    ... )

    Select the first column:

    >>> df.select(cs.first())
    shape: (2, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ str │
    ╞═════╡
    │ x   │
    │ y   │
    └─────┘

    Select everything  *except* for the first column:

    >>> df.select(~cs.first())
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ bar ┆ baz ┆ zap │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ f64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 123 ┆ 2.0 ┆ 0   │
    │ 456 ┆ 5.5 ┆ 1   │
    └─────┴─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.first(strict))


def float() -> Selector:
    """
    Select all float columns.

    See Also
    --------
    integer : Select all integer columns.
    numeric : Select all numeric columns.
    signed_integer : Select all signed integer columns.
    unsigned_integer : Select all unsigned integer columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0.0, 1.0],
    ...     },
    ...     schema_overrides={"baz": pl.Float32, "zap": pl.Float64},
    ... )

    Select all float columns:

    >>> df.select(cs.float())
    shape: (2, 2)
    ┌─────┬─────┐
    │ baz ┆ zap │
    │ --- ┆ --- │
    │ f32 ┆ f64 │
    ╞═════╪═════╡
    │ 2.0 ┆ 0.0 │
    │ 5.5 ┆ 1.0 │
    └─────┴─────┘

    Select all columns *except* for those that are float:

    >>> df.select(~cs.float())
    shape: (2, 2)
    ┌─────┬─────┐
    │ foo ┆ bar │
    │ --- ┆ --- │
    │ str ┆ i64 │
    ╞═════╪═════╡
    │ x   ┆ 123 │
    │ y   ┆ 456 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.float())


def integer() -> Selector:
    """
    Select all integer columns.

    See Also
    --------
    by_dtype : Select columns by dtype.
    float : Select all float columns.
    numeric : Select all numeric columns.
    signed_integer : Select all signed integer columns.
    unsigned_integer : Select all unsigned integer columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0, 1],
    ...     }
    ... )

    Select all integer columns:

    >>> df.select(cs.integer())
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ zap │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 123 ┆ 0   │
    │ 456 ┆ 1   │
    └─────┴─────┘

    Select all columns *except* for those that are integer :

    >>> df.select(~cs.integer())
    shape: (2, 2)
    ┌─────┬─────┐
    │ foo ┆ baz │
    │ --- ┆ --- │
    │ str ┆ f64 │
    ╞═════╪═════╡
    │ x   ┆ 2.0 │
    │ y   ┆ 5.5 │
    └─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.integer())


def signed_integer() -> Selector:
    """
    Select all signed integer columns.

    See Also
    --------
    by_dtype : Select columns by dtype.
    float : Select all float columns.
    integer : Select all integer columns.
    numeric : Select all numeric columns.
    unsigned_integer : Select all unsigned integer columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [-123, -456],
    ...         "bar": [3456, 6789],
    ...         "baz": [7654, 4321],
    ...         "zap": ["ab", "cd"],
    ...     },
    ...     schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
    ... )

    Select all signed integer columns:

    >>> df.select(cs.signed_integer())
    shape: (2, 1)
    ┌──────┐
    │ foo  │
    │ ---  │
    │ i64  │
    ╞══════╡
    │ -123 │
    │ -456 │
    └──────┘

    >>> df.select(~cs.signed_integer())
    shape: (2, 3)
    ┌──────┬──────┬─────┐
    │ bar  ┆ baz  ┆ zap │
    │ ---  ┆ ---  ┆ --- │
    │ u32  ┆ u64  ┆ str │
    ╞══════╪══════╪═════╡
    │ 3456 ┆ 7654 ┆ ab  │
    │ 6789 ┆ 4321 ┆ cd  │
    └──────┴──────┴─────┘

    Select all integer columns (both signed and unsigned):

    >>> df.select(cs.integer())
    shape: (2, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ baz  │
    │ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ u32  ┆ u64  │
    ╞══════╪══════╪══════╡
    │ -123 ┆ 3456 ┆ 7654 │
    │ -456 ┆ 6789 ┆ 4321 │
    └──────┴──────┴──────┘
    """
    return Selector._from_pyselector(PySelector.signed_integer())


def unsigned_integer() -> Selector:
    """
    Select all unsigned integer columns.

    See Also
    --------
    by_dtype : Select columns by dtype.
    float : Select all float columns.
    integer : Select all integer columns.
    numeric : Select all numeric columns.
    signed_integer : Select all signed integer columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [-123, -456],
    ...         "bar": [3456, 6789],
    ...         "baz": [7654, 4321],
    ...         "zap": ["ab", "cd"],
    ...     },
    ...     schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
    ... )

    Select all unsigned integer columns:

    >>> df.select(cs.unsigned_integer())
    shape: (2, 2)
    ┌──────┬──────┐
    │ bar  ┆ baz  │
    │ ---  ┆ ---  │
    │ u32  ┆ u64  │
    ╞══════╪══════╡
    │ 3456 ┆ 7654 │
    │ 6789 ┆ 4321 │
    └──────┴──────┘

    Select all columns *except* for those that are unsigned integers:

    >>> df.select(~cs.unsigned_integer())
    shape: (2, 2)
    ┌──────┬─────┐
    │ foo  ┆ zap │
    │ ---  ┆ --- │
    │ i64  ┆ str │
    ╞══════╪═════╡
    │ -123 ┆ ab  │
    │ -456 ┆ cd  │
    └──────┴─────┘

    Select all integer columns (both signed and unsigned):

    >>> df.select(cs.integer())
    shape: (2, 3)
    ┌──────┬──────┬──────┐
    │ foo  ┆ bar  ┆ baz  │
    │ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ u32  ┆ u64  │
    ╞══════╪══════╪══════╡
    │ -123 ┆ 3456 ┆ 7654 │
    │ -456 ┆ 6789 ┆ 4321 │
    └──────┴──────┴──────┘
    """
    return Selector._from_pyselector(PySelector.unsigned_integer())


def last(*, strict: bool = True) -> Selector:
    """
    Select the last column in the current scope.

    See Also
    --------
    all : Select all columns.
    first : Select the first column in the current scope.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0, 1],
    ...     }
    ... )

    Select the last column:

    >>> df.select(cs.last())
    shape: (2, 1)
    ┌─────┐
    │ zap │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 0   │
    │ 1   │
    └─────┘

    Select everything  *except* for the last column:

    >>> df.select(~cs.last())
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ foo ┆ bar ┆ baz │
    │ --- ┆ --- ┆ --- │
    │ str ┆ i64 ┆ f64 │
    ╞═════╪═════╪═════╡
    │ x   ┆ 123 ┆ 2.0 │
    │ y   ┆ 456 ┆ 5.5 │
    └─────┴─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.last(strict))


def matches(pattern: str) -> Selector:
    """
    Select all columns that match the given regex pattern.

    See Also
    --------
    contains : Select all columns that contain the given substring.
    ends_with : Select all columns that end with the given substring(s).
    starts_with : Select all columns that start with the given substring(s).

    Parameters
    ----------
    pattern
        A valid regular expression pattern, compatible with the `regex crate
        <https://docs.rs/regex/latest/regex/>`_.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0, 1],
    ...     }
    ... )

    Match column names containing an 'a', preceded by a character that is not 'z':

    >>> df.select(cs.matches("[^z]a"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 123 ┆ 2.0 │
    │ 456 ┆ 5.5 │
    └─────┴─────┘

    Do not match column names ending in 'R' or 'z' (case-insensitively):

    >>> df.select(~cs.matches(r"(?i)R|z$"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ foo ┆ zap │
    │ --- ┆ --- │
    │ str ┆ i64 │
    ╞═════╪═════╡
    │ x   ┆ 0   │
    │ y   ┆ 1   │
    └─────┴─────┘
    """
    if pattern == ".*":
        return all()
    else:
        if pattern.startswith(".*"):
            pattern = pattern[2:]
        elif pattern.endswith(".*"):
            pattern = pattern[:-2]

        pfx = "^.*" if not pattern.startswith("^") else ""
        sfx = ".*$" if not pattern.endswith("$") else ""
        raw_params = f"{pfx}{pattern}{sfx}"

        return Selector._from_pyselector(PySelector.matches(raw_params))


def numeric() -> Selector:
    """
    Select all numeric columns.

    See Also
    --------
    by_dtype : Select columns by dtype.
    float : Select all float columns.
    integer : Select all integer columns.
    signed_integer : Select all signed integer columns.
    unsigned_integer : Select all unsigned integer columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": ["x", "y"],
    ...         "bar": [123, 456],
    ...         "baz": [2.0, 5.5],
    ...         "zap": [0, 0],
    ...     },
    ...     schema_overrides={"bar": pl.Int16, "baz": pl.Float32, "zap": pl.UInt8},
    ... )

    Match all numeric columns:

    >>> df.select(cs.numeric())
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ bar ┆ baz ┆ zap │
    │ --- ┆ --- ┆ --- │
    │ i16 ┆ f32 ┆ u8  │
    ╞═════╪═════╪═════╡
    │ 123 ┆ 2.0 ┆ 0   │
    │ 456 ┆ 5.5 ┆ 0   │
    └─────┴─────┴─────┘

    Match all columns *except* for those that are numeric:

    >>> df.select(~cs.numeric())
    shape: (2, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ str │
    ╞═════╡
    │ x   │
    │ y   │
    └─────┘
    """
    return Selector._from_pyselector(PySelector.numeric())


def object() -> Selector:
    """
    Select all object columns.

    See Also
    --------
    by_dtype : Select columns by dtype.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> from uuid import uuid4
    >>> with pl.Config(fmt_str_lengths=36):
    ...     df = pl.DataFrame(
    ...         {
    ...             "idx": [0, 1],
    ...             "uuid_obj": [uuid4(), uuid4()],
    ...             "uuid_str": [str(uuid4()), str(uuid4())],
    ...         },
    ...         schema_overrides={"idx": pl.Int32},
    ...     )
    ...     print(df)  # doctest: +IGNORE_RESULT
    shape: (2, 3)
    ┌─────┬──────────────────────────────────────┬──────────────────────────────────────┐
    │ idx ┆ uuid_obj                             ┆ uuid_str                             │
    │ --- ┆ ---                                  ┆ ---                                  │
    │ i32 ┆ object                               ┆ str                                  │
    ╞═════╪══════════════════════════════════════╪══════════════════════════════════════╡
    │ 0   ┆ 6be063cf-c9c6-43be-878e-e446cfd42981 ┆ acab9fea-c05d-4b91-b639-418004a63f33 │
    │ 1   ┆ 7849d8f9-2cac-48e7-96d3-63cf81c14869 ┆ 28c65415-8b7d-4857-a4ce-300dca14b12b │
    └─────┴──────────────────────────────────────┴──────────────────────────────────────┘

    Select object columns and export as a dict:

    >>> df.select(cs.object()).to_dict(as_series=False)  # doctest: +IGNORE_RESULT
    {
        "uuid_obj": [
            UUID("6be063cf-c9c6-43be-878e-e446cfd42981"),
            UUID("7849d8f9-2cac-48e7-96d3-63cf81c14869"),
        ]
    }

    Select all columns *except* for those that are object and export as dict:

    >>> df.select(~cs.object())  # doctest: +IGNORE_RESULT
    {
        "idx": [0, 1],
        "uuid_str": [
            "acab9fea-c05d-4b91-b639-418004a63f33",
            "28c65415-8b7d-4857-a4ce-300dca14b12b",
        ],
    }
    """  # noqa: W505
    return Selector._from_pyselector(PySelector.object())


def starts_with(*prefix: str) -> Selector:
    """
    Select columns that start with the given substring(s).

    Parameters
    ----------
    prefix
        Substring(s) that matching column names should start with.

    See Also
    --------
    contains : Select all columns that contain the given substring.
    ends_with : Select all columns that end with the given substring(s).
    matches : Select all columns that match the given regex pattern.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [1.0, 2.0],
    ...         "bar": [3.0, 4.0],
    ...         "baz": [5, 6],
    ...         "zap": [7, 8],
    ...     }
    ... )

    Match columns starting with a 'b':

    >>> df.select(cs.starts_with("b"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ bar ┆ baz │
    │ --- ┆ --- │
    │ f64 ┆ i64 │
    ╞═════╪═════╡
    │ 3.0 ┆ 5   │
    │ 4.0 ┆ 6   │
    └─────┴─────┘

    Match columns starting with *either* the letter 'b' or 'z':

    >>> df.select(cs.starts_with("b", "z"))
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ bar ┆ baz ┆ zap │
    │ --- ┆ --- ┆ --- │
    │ f64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 3.0 ┆ 5   ┆ 7   │
    │ 4.0 ┆ 6   ┆ 8   │
    └─────┴─────┴─────┘

    Match all columns *except* for those starting with 'b':

    >>> df.select(~cs.starts_with("b"))
    shape: (2, 2)
    ┌─────┬─────┐
    │ foo ┆ zap │
    │ --- ┆ --- │
    │ f64 ┆ i64 │
    ╞═════╪═════╡
    │ 1.0 ┆ 7   │
    │ 2.0 ┆ 8   │
    └─────┴─────┘
    """
    escaped_prefix = _re_string(prefix)
    raw_params = f"^{escaped_prefix}.*$"

    return Selector._from_pyselector(PySelector.matches(raw_params))


def string(*, include_categorical: bool = False) -> Selector:
    """
    Select all String (and, optionally, Categorical) string columns.

    See Also
    --------
    binary : Select all binary columns.
    by_dtype : Select all columns matching the given dtype(s).
    categorical: Select all categorical columns.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "w": ["xx", "yy", "xx", "yy", "xx"],
    ...         "x": [1, 2, 1, 4, -2],
    ...         "y": [3.0, 4.5, 1.0, 2.5, -2.0],
    ...         "z": ["a", "b", "a", "b", "b"],
    ...     },
    ... ).with_columns(
    ...     z=pl.col("z").cast(pl.Categorical("lexical")),
    ... )

    Group by all string columns, sum the numeric columns, then sort by the string cols:

    >>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string())
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ w   ┆ x   ┆ y   │
    │ --- ┆ --- ┆ --- │
    │ str ┆ i64 ┆ f64 │
    ╞═════╪═════╪═════╡
    │ xx  ┆ 0   ┆ 2.0 │
    │ yy  ┆ 6   ┆ 7.0 │
    └─────┴─────┴─────┘

    Group by all string *and* categorical columns:

    >>> df.group_by(cs.string(include_categorical=True)).agg(cs.numeric().sum()).sort(
    ...     by=cs.string(include_categorical=True)
    ... )
    shape: (3, 4)
    ┌─────┬─────┬─────┬──────┐
    │ w   ┆ z   ┆ x   ┆ y    │
    │ --- ┆ --- ┆ --- ┆ ---  │
    │ str ┆ cat ┆ i64 ┆ f64  │
    ╞═════╪═════╪═════╪══════╡
    │ xx  ┆ a   ┆ 2   ┆ 4.0  │
    │ xx  ┆ b   ┆ -2  ┆ -2.0 │
    │ yy  ┆ b   ┆ 6   ┆ 7.0  │
    └─────┴─────┴─────┴──────┘
    """
    string_dtypes: builtins.list[PolarsDataType] = [String]
    if include_categorical:
        string_dtypes.append(Categorical)

    return by_dtype(string_dtypes)


def temporal() -> Selector:
    """
    Select all temporal columns.

    See Also
    --------
    by_dtype : Select all columns matching the given dtype(s).
    date : Select all date columns.
    datetime : Select all datetime columns, optionally filtering by time unit/zone.
    duration : Select all duration columns, optionally filtering by time unit.
    time : Select all time columns.

    Examples
    --------
    >>> from datetime import date, time
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [date(2021, 1, 1), date(2021, 1, 2)],
    ...         "tm": [time(12, 0, 0), time(20, 30, 45)],
    ...         "value": [1.2345, 2.3456],
    ...     }
    ... )

    Match all temporal columns:

    >>> df.select(cs.temporal())
    shape: (2, 2)
    ┌────────────┬──────────┐
    │ dt         ┆ tm       │
    │ ---        ┆ ---      │
    │ date       ┆ time     │
    ╞════════════╪══════════╡
    │ 2021-01-01 ┆ 12:00:00 │
    │ 2021-01-02 ┆ 20:30:45 │
    └────────────┴──────────┘

    Match all temporal columns *except* for time columns:

    >>> df.select(cs.temporal() - cs.time())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 2021-01-01 │
    │ 2021-01-02 │
    └────────────┘

    Match all columns *except* for temporal columns:

    >>> df.select(~cs.temporal())
    shape: (2, 1)
    ┌────────┐
    │ value  │
    │ ---    │
    │ f64    │
    ╞════════╡
    │ 1.2345 │
    │ 2.3456 │
    └────────┘
    """
    return Selector._from_pyselector(PySelector.temporal())


def time() -> Selector:
    """
    Select all time columns.

    See Also
    --------
    date : Select all date columns.
    datetime : Select all datetime columns, optionally filtering by time unit/zone.
    duration : Select all duration columns, optionally filtering by time unit.
    temporal : Select all temporal columns.

    Examples
    --------
    >>> from datetime import date, datetime, time
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
    ...         "dt": [date(1999, 12, 31), date(2024, 8, 9)],
    ...         "tm": [time(0, 0, 0), time(23, 59, 59)],
    ...     },
    ... )

    Select all time columns:

    >>> df.select(cs.time())
    shape: (2, 1)
    ┌──────────┐
    │ tm       │
    │ ---      │
    │ time     │
    ╞══════════╡
    │ 00:00:00 │
    │ 23:59:59 │
    └──────────┘

    Select all columns *except* for those that are times:

    >>> df.select(~cs.time())
    shape: (2, 2)
    ┌─────────────────────┬────────────┐
    │ dtm                 ┆ dt         │
    │ ---                 ┆ ---        │
    │ datetime[μs]        ┆ date       │
    ╞═════════════════════╪════════════╡
    │ 2001-05-07 10:25:00 ┆ 1999-12-31 │
    │ 2031-12-31 00:30:00 ┆ 2024-08-09 │
    └─────────────────────┴────────────┘
    """
    return by_dtype([Time])