DriverTrac/venv/lib/python3.12/site-packages/polars/functions/col.py

385 lines
12 KiB
Python

from __future__ import annotations
import contextlib
import sys
from collections.abc import Iterable
from datetime import datetime, timedelta
from typing import TYPE_CHECKING
import polars._reexport as pl
from polars._utils.wrap import wrap_expr
from polars.datatypes import (
Datetime,
Duration,
is_polars_dtype,
parse_into_dtype,
)
from polars.datatypes.group import (
DATETIME_DTYPES,
DURATION_DTYPES,
FLOAT_DTYPES,
INTEGER_DTYPES,
)
with contextlib.suppress(ImportError): # Module not available when building docs
import polars._plr as plr
if TYPE_CHECKING:
from polars._typing import PolarsDataType, PythonDataType
from polars.expr.expr import Expr
if not sys.version_info >= (3, 11):
from typing import Any
__all__ = ["col"]
def _create_col(
name: (
str
| PolarsDataType
| PythonDataType
| Iterable[str]
| Iterable[PolarsDataType | PythonDataType]
),
*more_names: str | PolarsDataType | PythonDataType,
) -> Expr:
"""Create one or more column expressions representing column(s) in a DataFrame."""
dtypes: list[PolarsDataType]
if more_names:
if isinstance(name, str):
names_str = [name]
names_str.extend(more_names) # type: ignore[arg-type]
return pl.Selector._by_name(names_str, strict=True).as_expr()
elif is_polars_dtype(name):
dtypes = [name]
dtypes.extend(more_names) # type: ignore[arg-type]
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
else:
msg = (
"invalid input for `col`"
f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}."
)
raise TypeError(msg)
if isinstance(name, str):
return wrap_expr(plr.col(name))
elif is_polars_dtype(name):
dtypes = _polars_dtype_match(name)
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
elif isinstance(name, type):
dtypes = _python_dtype_match(name)
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
elif isinstance(name, Iterable):
names = list(name)
if not names:
return pl.Selector._by_name(names, strict=True).as_expr() # type: ignore[arg-type]
item = names[0]
if isinstance(item, str):
return pl.Selector._by_name(names, strict=True).as_expr() # type: ignore[arg-type]
elif is_polars_dtype(item):
dtypes = []
for nm in names:
dtypes.extend(_polars_dtype_match(nm)) # type: ignore[arg-type]
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
elif isinstance(item, type):
dtypes = []
for nm in names:
dtypes.extend(_python_dtype_match(nm)) # type: ignore[arg-type]
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
else:
msg = (
"invalid input for `col`"
"\n\nExpected iterable of type `str` or `DataType`,"
f" got iterable of type {type(item).__name__!r}."
)
raise TypeError(msg)
else:
msg = (
"invalid input for `col`"
f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}."
)
raise TypeError(msg)
def _python_dtype_match(tp: PythonDataType) -> list[PolarsDataType]:
if tp is int:
return list(INTEGER_DTYPES)
elif tp is float:
return list(FLOAT_DTYPES)
elif tp is datetime:
return list(DATETIME_DTYPES)
elif tp is timedelta:
return list(DURATION_DTYPES)
return [parse_into_dtype(tp)]
def _polars_dtype_match(tp: PolarsDataType) -> list[PolarsDataType]:
if Datetime.is_(tp):
return list(DATETIME_DTYPES)
elif Duration.is_(tp):
return list(DURATION_DTYPES)
return [tp]
class Col:
"""
Create Polars column expressions.
Notes
-----
An instance of this class is exported under the name `col`. It can be used as
though it were a function by calling, for example, `pl.col("foo")`.
See the :func:`__call__` method for further documentation.
This helper class enables an alternative syntax for creating a column expression
through attribute lookup. For example `col.foo` creates an expression equal to
`col("foo")`. See the :func:`__getattr__` method for further documentation.
The function call syntax is considered the idiomatic way of constructing a column
expression. The alternative attribute syntax can be useful for quick prototyping as
it can save some keystrokes, but has drawbacks in both expressiveness and
readability.
Examples
--------
>>> from polars import col
>>> df = pl.DataFrame(
... {
... "foo": [1, 2],
... "bar": [3, 4],
... }
... )
Create a new column expression using the standard syntax:
>>> df.with_columns(baz=(col("foo") * col("bar")) / 2)
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ baz │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 │
╞═════╪═════╪═════╡
│ 1 ┆ 3 ┆ 1.5 │
│ 2 ┆ 4 ┆ 4.0 │
└─────┴─────┴─────┘
Use attribute lookup to create a new column expression:
>>> df.with_columns(baz=(col.foo + col.bar))
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ baz │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1 ┆ 3 ┆ 4 │
│ 2 ┆ 4 ┆ 6 │
└─────┴─────┴─────┘
"""
def __call__(
self,
name: (
str
| PolarsDataType
| PythonDataType
| Iterable[str]
| Iterable[PolarsDataType | PythonDataType]
),
*more_names: str | PolarsDataType | PythonDataType,
) -> Expr:
"""
Create one or more expressions representing columns in a DataFrame.
Parameters
----------
name
The name or datatype of the column(s) to represent.
Accepts regular expression input; regular expressions
should start with `^` and end with `$`.
*more_names
Additional names or datatypes of columns to represent,
specified as positional arguments.
See Also
--------
first
last
nth
Examples
--------
Pass a single column name to represent that column.
>>> df = pl.DataFrame(
... {
... "ham": [1, 2],
... "hamburger": [11, 22],
... "foo": [2, 1],
... "bar": ["a", "b"],
... }
... )
>>> df.select(pl.col("foo"))
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ i64 │
╞═════╡
│ 2 │
│ 1 │
└─────┘
Use dot syntax to save keystrokes for quick prototyping.
>>> from polars import col as c
>>> df.select(c.foo + c.ham)
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ i64 │
╞═════╡
│ 3 │
│ 3 │
└─────┘
Use the wildcard `*` to represent all columns.
>>> df.select(pl.col("*"))
shape: (2, 4)
┌─────┬───────────┬─────┬─────┐
│ ham ┆ hamburger ┆ foo ┆ bar │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str │
╞═════╪═══════════╪═════╪═════╡
│ 1 ┆ 11 ┆ 2 ┆ a │
│ 2 ┆ 22 ┆ 1 ┆ b │
└─────┴───────────┴─────┴─────┘
>>> df.select(pl.col("*").exclude("ham"))
shape: (2, 3)
┌───────────┬─────┬─────┐
│ hamburger ┆ foo ┆ bar │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═══════════╪═════╪═════╡
│ 11 ┆ 2 ┆ a │
│ 22 ┆ 1 ┆ b │
└───────────┴─────┴─────┘
Regular expression input is supported.
>>> df.select(pl.col("^ham.*$"))
shape: (2, 2)
┌─────┬───────────┐
│ ham ┆ hamburger │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═══════════╡
│ 1 ┆ 11 │
│ 2 ┆ 22 │
└─────┴───────────┘
Multiple columns can be represented by passing a list of names.
>>> df.select(pl.col(["hamburger", "foo"]))
shape: (2, 2)
┌───────────┬─────┐
│ hamburger ┆ foo │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═══════════╪═════╡
│ 11 ┆ 2 │
│ 22 ┆ 1 │
└───────────┴─────┘
Or use positional arguments to represent multiple columns in the same way.
>>> df.select(pl.col("hamburger", "foo"))
shape: (2, 2)
┌───────────┬─────┐
│ hamburger ┆ foo │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═══════════╪═════╡
│ 11 ┆ 2 │
│ 22 ┆ 1 │
└───────────┴─────┘
Easily select all columns that match a certain data type by passing that
datatype.
>>> df.select(pl.col(pl.String))
shape: (2, 1)
┌─────┐
│ bar │
│ --- │
│ str │
╞═════╡
│ a │
│ b │
└─────┘
>>> df.select(pl.col(pl.Int64, pl.Float64))
shape: (2, 3)
┌─────┬───────────┬─────┐
│ ham ┆ hamburger ┆ foo │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═══════════╪═════╡
│ 1 ┆ 11 ┆ 2 │
│ 2 ┆ 22 ┆ 1 │
└─────┴───────────┴─────┘
"""
return _create_col(name, *more_names)
def __getattr__(self, name: str) -> Expr:
"""
Create a column expression using attribute syntax.
Note that this syntax does not support passing data
types or multiple column names.
Parameters
----------
name
The name of the column to represent.
Examples
--------
>>> from polars import col as c
>>> df = pl.DataFrame(
... {
... "foo": [1, 2],
... "bar": [3, 4],
... }
... )
>>> df.select(c.foo + c.bar)
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ i64 │
╞═════╡
│ 4 │
│ 6 │
└─────┘
"""
# For autocomplete to work with IPython
if name.startswith("__wrapped__"):
return getattr(type(self), name)
return _create_col(name)
if not sys.version_info >= (3, 11):
def __getstate__(self) -> Any:
return self.__dict__
def __setstate__(self, state: Any) -> None:
self.__dict__ = state
col: Col = Col()