385 lines
12 KiB
Python
385 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import sys
|
|
from collections.abc import Iterable
|
|
from datetime import datetime, timedelta
|
|
from typing import TYPE_CHECKING
|
|
|
|
import polars._reexport as pl
|
|
from polars._utils.wrap import wrap_expr
|
|
from polars.datatypes import (
|
|
Datetime,
|
|
Duration,
|
|
is_polars_dtype,
|
|
parse_into_dtype,
|
|
)
|
|
from polars.datatypes.group import (
|
|
DATETIME_DTYPES,
|
|
DURATION_DTYPES,
|
|
FLOAT_DTYPES,
|
|
INTEGER_DTYPES,
|
|
)
|
|
|
|
with contextlib.suppress(ImportError): # Module not available when building docs
|
|
import polars._plr as plr
|
|
|
|
if TYPE_CHECKING:
|
|
from polars._typing import PolarsDataType, PythonDataType
|
|
from polars.expr.expr import Expr
|
|
|
|
if not sys.version_info >= (3, 11):
|
|
from typing import Any
|
|
|
|
__all__ = ["col"]
|
|
|
|
|
|
def _create_col(
|
|
name: (
|
|
str
|
|
| PolarsDataType
|
|
| PythonDataType
|
|
| Iterable[str]
|
|
| Iterable[PolarsDataType | PythonDataType]
|
|
),
|
|
*more_names: str | PolarsDataType | PythonDataType,
|
|
) -> Expr:
|
|
"""Create one or more column expressions representing column(s) in a DataFrame."""
|
|
dtypes: list[PolarsDataType]
|
|
if more_names:
|
|
if isinstance(name, str):
|
|
names_str = [name]
|
|
names_str.extend(more_names) # type: ignore[arg-type]
|
|
return pl.Selector._by_name(names_str, strict=True).as_expr()
|
|
elif is_polars_dtype(name):
|
|
dtypes = [name]
|
|
dtypes.extend(more_names) # type: ignore[arg-type]
|
|
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
|
|
else:
|
|
msg = (
|
|
"invalid input for `col`"
|
|
f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}."
|
|
)
|
|
raise TypeError(msg)
|
|
|
|
if isinstance(name, str):
|
|
return wrap_expr(plr.col(name))
|
|
elif is_polars_dtype(name):
|
|
dtypes = _polars_dtype_match(name)
|
|
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
|
|
elif isinstance(name, type):
|
|
dtypes = _python_dtype_match(name)
|
|
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
|
|
elif isinstance(name, Iterable):
|
|
names = list(name)
|
|
if not names:
|
|
return pl.Selector._by_name(names, strict=True).as_expr() # type: ignore[arg-type]
|
|
|
|
item = names[0]
|
|
if isinstance(item, str):
|
|
return pl.Selector._by_name(names, strict=True).as_expr() # type: ignore[arg-type]
|
|
elif is_polars_dtype(item):
|
|
dtypes = []
|
|
for nm in names:
|
|
dtypes.extend(_polars_dtype_match(nm)) # type: ignore[arg-type]
|
|
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
|
|
elif isinstance(item, type):
|
|
dtypes = []
|
|
for nm in names:
|
|
dtypes.extend(_python_dtype_match(nm)) # type: ignore[arg-type]
|
|
return pl.Selector._by_dtype(dtypes).as_expr() # type: ignore[arg-type]
|
|
else:
|
|
msg = (
|
|
"invalid input for `col`"
|
|
"\n\nExpected iterable of type `str` or `DataType`,"
|
|
f" got iterable of type {type(item).__name__!r}."
|
|
)
|
|
raise TypeError(msg)
|
|
else:
|
|
msg = (
|
|
"invalid input for `col`"
|
|
f"\n\nExpected `str` or `DataType`, got {type(name).__name__!r}."
|
|
)
|
|
raise TypeError(msg)
|
|
|
|
|
|
def _python_dtype_match(tp: PythonDataType) -> list[PolarsDataType]:
|
|
if tp is int:
|
|
return list(INTEGER_DTYPES)
|
|
elif tp is float:
|
|
return list(FLOAT_DTYPES)
|
|
elif tp is datetime:
|
|
return list(DATETIME_DTYPES)
|
|
elif tp is timedelta:
|
|
return list(DURATION_DTYPES)
|
|
return [parse_into_dtype(tp)]
|
|
|
|
|
|
def _polars_dtype_match(tp: PolarsDataType) -> list[PolarsDataType]:
|
|
if Datetime.is_(tp):
|
|
return list(DATETIME_DTYPES)
|
|
elif Duration.is_(tp):
|
|
return list(DURATION_DTYPES)
|
|
return [tp]
|
|
|
|
|
|
class Col:
|
|
"""
|
|
Create Polars column expressions.
|
|
|
|
Notes
|
|
-----
|
|
An instance of this class is exported under the name `col`. It can be used as
|
|
though it were a function by calling, for example, `pl.col("foo")`.
|
|
See the :func:`__call__` method for further documentation.
|
|
|
|
This helper class enables an alternative syntax for creating a column expression
|
|
through attribute lookup. For example `col.foo` creates an expression equal to
|
|
`col("foo")`. See the :func:`__getattr__` method for further documentation.
|
|
|
|
The function call syntax is considered the idiomatic way of constructing a column
|
|
expression. The alternative attribute syntax can be useful for quick prototyping as
|
|
it can save some keystrokes, but has drawbacks in both expressiveness and
|
|
readability.
|
|
|
|
Examples
|
|
--------
|
|
>>> from polars import col
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [1, 2],
|
|
... "bar": [3, 4],
|
|
... }
|
|
... )
|
|
|
|
Create a new column expression using the standard syntax:
|
|
|
|
>>> df.with_columns(baz=(col("foo") * col("bar")) / 2)
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ foo ┆ bar ┆ baz │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ f64 │
|
|
╞═════╪═════╪═════╡
|
|
│ 1 ┆ 3 ┆ 1.5 │
|
|
│ 2 ┆ 4 ┆ 4.0 │
|
|
└─────┴─────┴─────┘
|
|
|
|
Use attribute lookup to create a new column expression:
|
|
|
|
>>> df.with_columns(baz=(col.foo + col.bar))
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ foo ┆ bar ┆ baz │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ i64 │
|
|
╞═════╪═════╪═════╡
|
|
│ 1 ┆ 3 ┆ 4 │
|
|
│ 2 ┆ 4 ┆ 6 │
|
|
└─────┴─────┴─────┘
|
|
"""
|
|
|
|
def __call__(
|
|
self,
|
|
name: (
|
|
str
|
|
| PolarsDataType
|
|
| PythonDataType
|
|
| Iterable[str]
|
|
| Iterable[PolarsDataType | PythonDataType]
|
|
),
|
|
*more_names: str | PolarsDataType | PythonDataType,
|
|
) -> Expr:
|
|
"""
|
|
Create one or more expressions representing columns in a DataFrame.
|
|
|
|
Parameters
|
|
----------
|
|
name
|
|
The name or datatype of the column(s) to represent.
|
|
Accepts regular expression input; regular expressions
|
|
should start with `^` and end with `$`.
|
|
*more_names
|
|
Additional names or datatypes of columns to represent,
|
|
specified as positional arguments.
|
|
|
|
See Also
|
|
--------
|
|
first
|
|
last
|
|
nth
|
|
|
|
Examples
|
|
--------
|
|
Pass a single column name to represent that column.
|
|
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "ham": [1, 2],
|
|
... "hamburger": [11, 22],
|
|
... "foo": [2, 1],
|
|
... "bar": ["a", "b"],
|
|
... }
|
|
... )
|
|
>>> df.select(pl.col("foo"))
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 2 │
|
|
│ 1 │
|
|
└─────┘
|
|
|
|
Use dot syntax to save keystrokes for quick prototyping.
|
|
|
|
>>> from polars import col as c
|
|
>>> df.select(c.foo + c.ham)
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 3 │
|
|
│ 3 │
|
|
└─────┘
|
|
|
|
Use the wildcard `*` to represent all columns.
|
|
|
|
>>> df.select(pl.col("*"))
|
|
shape: (2, 4)
|
|
┌─────┬───────────┬─────┬─────┐
|
|
│ ham ┆ hamburger ┆ foo ┆ bar │
|
|
│ --- ┆ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ i64 ┆ str │
|
|
╞═════╪═══════════╪═════╪═════╡
|
|
│ 1 ┆ 11 ┆ 2 ┆ a │
|
|
│ 2 ┆ 22 ┆ 1 ┆ b │
|
|
└─────┴───────────┴─────┴─────┘
|
|
>>> df.select(pl.col("*").exclude("ham"))
|
|
shape: (2, 3)
|
|
┌───────────┬─────┬─────┐
|
|
│ hamburger ┆ foo ┆ bar │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ str │
|
|
╞═══════════╪═════╪═════╡
|
|
│ 11 ┆ 2 ┆ a │
|
|
│ 22 ┆ 1 ┆ b │
|
|
└───────────┴─────┴─────┘
|
|
|
|
Regular expression input is supported.
|
|
|
|
>>> df.select(pl.col("^ham.*$"))
|
|
shape: (2, 2)
|
|
┌─────┬───────────┐
|
|
│ ham ┆ hamburger │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ i64 │
|
|
╞═════╪═══════════╡
|
|
│ 1 ┆ 11 │
|
|
│ 2 ┆ 22 │
|
|
└─────┴───────────┘
|
|
|
|
Multiple columns can be represented by passing a list of names.
|
|
|
|
>>> df.select(pl.col(["hamburger", "foo"]))
|
|
shape: (2, 2)
|
|
┌───────────┬─────┐
|
|
│ hamburger ┆ foo │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ i64 │
|
|
╞═══════════╪═════╡
|
|
│ 11 ┆ 2 │
|
|
│ 22 ┆ 1 │
|
|
└───────────┴─────┘
|
|
|
|
Or use positional arguments to represent multiple columns in the same way.
|
|
|
|
>>> df.select(pl.col("hamburger", "foo"))
|
|
shape: (2, 2)
|
|
┌───────────┬─────┐
|
|
│ hamburger ┆ foo │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ i64 │
|
|
╞═══════════╪═════╡
|
|
│ 11 ┆ 2 │
|
|
│ 22 ┆ 1 │
|
|
└───────────┴─────┘
|
|
|
|
Easily select all columns that match a certain data type by passing that
|
|
datatype.
|
|
|
|
>>> df.select(pl.col(pl.String))
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ bar │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ a │
|
|
│ b │
|
|
└─────┘
|
|
>>> df.select(pl.col(pl.Int64, pl.Float64))
|
|
shape: (2, 3)
|
|
┌─────┬───────────┬─────┐
|
|
│ ham ┆ hamburger ┆ foo │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ i64 │
|
|
╞═════╪═══════════╪═════╡
|
|
│ 1 ┆ 11 ┆ 2 │
|
|
│ 2 ┆ 22 ┆ 1 │
|
|
└─────┴───────────┴─────┘
|
|
"""
|
|
return _create_col(name, *more_names)
|
|
|
|
def __getattr__(self, name: str) -> Expr:
|
|
"""
|
|
Create a column expression using attribute syntax.
|
|
|
|
Note that this syntax does not support passing data
|
|
types or multiple column names.
|
|
|
|
Parameters
|
|
----------
|
|
name
|
|
The name of the column to represent.
|
|
|
|
Examples
|
|
--------
|
|
>>> from polars import col as c
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [1, 2],
|
|
... "bar": [3, 4],
|
|
... }
|
|
... )
|
|
>>> df.select(c.foo + c.bar)
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 4 │
|
|
│ 6 │
|
|
└─────┘
|
|
"""
|
|
# For autocomplete to work with IPython
|
|
if name.startswith("__wrapped__"):
|
|
return getattr(type(self), name)
|
|
|
|
return _create_col(name)
|
|
|
|
if not sys.version_info >= (3, 11):
|
|
|
|
def __getstate__(self) -> Any:
|
|
return self.__dict__
|
|
|
|
def __setstate__(self, state: Any) -> None:
|
|
self.__dict__ = state
|
|
|
|
|
|
col: Col = Col()
|