from __future__ import annotations
from typing import TYPE_CHECKING, Any, Callable
from polars import functions as F
from polars._utils.convert import parse_as_duration_string
from polars._utils.deprecation import deprecated
if TYPE_CHECKING:
import sys
from collections.abc import Iterable
from datetime import timedelta
from polars import DataFrame
from polars._typing import (
ClosedInterval,
IntoExpr,
Label,
QuantileMethod,
SchemaDict,
StartBy,
)
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
if sys.version_info >= (3, 13):
from warnings import deprecated
else:
from typing_extensions import deprecated # noqa: TC004
class GroupBy:
"""Starts a new GroupBy operation."""
def __init__(
self,
df: DataFrame,
*by: IntoExpr | Iterable[IntoExpr],
maintain_order: bool,
**named_by: IntoExpr,
) -> None:
"""
Utility class for performing a group by operation over the given DataFrame.
Generated by calling `df.group_by(...)`.
Parameters
----------
df
DataFrame to perform the group by operation over.
*by
Column or columns to group by. Accepts expression input. Strings are parsed
as column names.
maintain_order
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
**named_by
Additional column(s) to group by, specified as keyword arguments.
The columns will be named as the keyword used.
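Examples
--------
A minimal sketch; the keyword name `doubled` is arbitrary:
>>> df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
>>> df.group_by("a", doubled=pl.col("a") * 2).agg(pl.sum("b")) # doctest: +IGNORE_RESULT
shape: (2, 3)
┌─────┬─────────┬─────┐
│ a ┆ doubled ┆ b │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════════╪═════╡
│ 1 ┆ 2 ┆ 9 │
│ 2 ┆ 4 ┆ 6 │
└─────┴─────────┴─────┘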
"""
self.df = df
self.by = by
self.named_by = named_by
self.maintain_order = maintain_order
def __iter__(self) -> Self:
"""
Allows iteration over the groups of the group by operation.
Each group is represented by a tuple of `(name, data)`. The group names are
tuples of the distinct group values that identify each group.
Examples
--------
>>> df = pl.DataFrame({"foo": ["a", "a", "b"], "bar": [1, 2, 3]})
>>> for name, data in df.group_by("foo"): # doctest: +SKIP
... print(name)
... print(data)
('a',)
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ a ┆ 1 │
│ a ┆ 2 │
└─────┴─────┘
('b',)
shape: (1, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ b ┆ 3 │
└─────┴─────┘
"""
# Every group gather can trigger a rechunk, so do it once, up front.
from polars.lazyframe.opt_flags import QueryOptFlags
self.df = self.df.rechunk()
temp_col = "__POLARS_GB_GROUP_INDICES"
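# Adding a row index before grouping lets us recover each group's original
# row positions: `with_row_index` puts the index column first, `F.first()`
# selects that column, and `agg` gathers it into one list per group.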
groups_df = (
self.df.lazy()
.with_row_index()
.group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
.agg(F.first().alias(temp_col))
.collect(optimizations=QueryOptFlags.none())
)
self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0
return self
def __next__(self) -> tuple[tuple[Any, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration
group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1
return group_name, group_data
def agg(
self,
*aggs: IntoExpr | Iterable[IntoExpr],
**named_aggs: IntoExpr,
) -> DataFrame:
"""
Compute aggregations for each group of a group by operation.
Parameters
----------
*aggs
Aggregations to compute for each group of the group by operation,
specified as positional arguments.
Accepts expression input. Strings are parsed as column names.
**named_aggs
Additional aggregations, specified as keyword arguments.
The resulting columns will be renamed to the keyword used.
Examples
--------
Compute the aggregation of the columns for each group.
>>> df = pl.DataFrame(
... {
... "a": ["a", "b", "a", "b", "c"],
... "b": [1, 2, 1, 3, 3],
... "c": [5, 4, 3, 2, 1],
... }
... )
>>> df.group_by("a").agg(pl.col("b"), pl.col("c")) # doctest: +IGNORE_RESULT
shape: (3, 3)
┌─────┬───────────┬───────────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ list[i64] ┆ list[i64] │
╞═════╪═══════════╪═══════════╡
│ a ┆ [1, 1] ┆ [5, 3] │
│ b ┆ [2, 3] ┆ [4, 2] │
│ c ┆ [3] ┆ [1] │
└─────┴───────────┴───────────┘
Compute the sum of a column for each group.
>>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ a ┆ 2 │
│ b ┆ 5 │
│ c ┆ 3 │
└─────┴─────┘
Compute multiple aggregates at once by passing a list of expressions.
>>> df.group_by("a").agg([pl.sum("b"), pl.mean("c")]) # doctest: +IGNORE_RESULT
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═════╪═════╪═════╡
│ c ┆ 3 ┆ 1.0 │
│ a ┆ 2 ┆ 4.0 │
│ b ┆ 5 ┆ 3.0 │
└─────┴─────┴─────┘
Or use positional arguments to compute multiple aggregations in the same way.
>>> df.group_by("a").agg(
... pl.sum("b").name.suffix("_sum"),
... (pl.col("c") ** 2).mean().name.suffix("_mean_squared"),
... ) # doctest: +IGNORE_RESULT
shape: (3, 3)
┌─────┬───────┬────────────────┐
│ a ┆ b_sum ┆ c_mean_squared │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═════╪═══════╪════════════════╡
│ a ┆ 2 ┆ 17.0 │
│ c ┆ 3 ┆ 1.0 │
│ b ┆ 5 ┆ 10.0 │
└─────┴───────┴────────────────┘
Use keyword arguments to easily name your expression inputs.
>>> df.group_by("a").agg(
... b_sum=pl.sum("b"),
... c_mean_squared=(pl.col("c") ** 2).mean(),
... ) # doctest: +IGNORE_RESULT
shape: (3, 3)
┌─────┬───────┬────────────────┐
│ a ┆ b_sum ┆ c_mean_squared │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═════╪═══════╪════════════════╡
│ a ┆ 2 ┆ 17.0 │
│ c ┆ 3 ┆ 1.0 │
│ b ┆ 5 ┆ 10.0 │
└─────┴───────┴────────────────┘
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
.agg(*aggs, **named_aggs)
.collect(optimizations=QueryOptFlags.none())
)
def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame:
"""
Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
.. warning::
This method is much slower than the native expressions API.
Only use it if you cannot implement your logic otherwise.
Implementing logic using a Python function is almost always *significantly*
slower and more memory intensive than implementing the same logic using
the native expression API because:
- The native expression engine runs in Rust; UDFs run in Python.
- Use of Python UDFs forces the DataFrame to be materialized in memory.
- Polars-native expressions can be parallelised (UDFs cannot).
- Polars-native expressions can be logically optimised (UDFs cannot).
Wherever possible you should strongly prefer the native expression API
to achieve the best performance.
Parameters
----------
function
Custom function that receives a DataFrame and returns a DataFrame.
Returns
-------
DataFrame
Examples
--------
For each color group sample two rows:
>>> df = pl.DataFrame(
... {
... "id": [0, 1, 2, 3, 4],
... "color": ["red", "green", "green", "red", "red"],
... "shape": ["square", "triangle", "square", "triangle", "square"],
... }
... )
>>> df.group_by("color").map_groups(
... lambda group_df: group_df.sample(2)
... ) # doctest: +IGNORE_RESULT
shape: (4, 3)
┌─────┬───────┬──────────┐
│ id ┆ color ┆ shape │
│ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str │
╞═════╪═══════╪══════════╡
│ 1 ┆ green ┆ triangle │
│ 2 ┆ green ┆ square │
│ 4 ┆ red ┆ square │
│ 3 ┆ red ┆ triangle │
└─────┴───────┴──────────┘
It is better to implement this with an expression:
>>> df.filter(
... pl.int_range(pl.len()).shuffle().over("color") < 2
... ) # doctest: +IGNORE_RESULT
"""
if self.named_by:
msg = "cannot call `map_groups` when grouping by named expressions"
raise TypeError(msg)
if not all(isinstance(c, str) for c in self.by):
msg = "cannot call `map_groups` when grouping by an expression"
raise TypeError(msg)
by_strs: list[str] = self.by # type: ignore[assignment]
return self.df.__class__._from_pydf(
self.df._df.group_by_map_groups(by_strs, function, self.maintain_order)
)
def head(self, n: int = 5) -> DataFrame:
"""
Get the first `n` rows of each group.
Parameters
----------
n
Number of rows to return.
Examples
--------
>>> df = pl.DataFrame(
... {
... "letters": ["c", "c", "a", "c", "a", "b"],
... "nrs": [1, 2, 3, 4, 5, 6],
... }
... )
>>> df
shape: (6, 2)
┌─────────┬─────┐
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ c ┆ 1 │
│ c ┆ 2 │
│ a ┆ 3 │
│ c ┆ 4 │
│ a ┆ 5 │
│ b ┆ 6 │
└─────────┴─────┘
>>> df.group_by("letters").head(2).sort("letters")
shape: (5, 2)
┌─────────┬─────┐
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ a ┆ 3 │
│ a ┆ 5 │
│ b ┆ 6 │
│ c ┆ 1 │
│ c ┆ 2 │
└─────────┴─────┘
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
.head(n)
.collect(optimizations=QueryOptFlags.none())
)
def tail(self, n: int = 5) -> DataFrame:
"""
Get the last `n` rows of each group.
Parameters
----------
n
Number of rows to return.
Examples
--------
>>> df = pl.DataFrame(
... {
... "letters": ["c", "c", "a", "c", "a", "b"],
... "nrs": [1, 2, 3, 4, 5, 6],
... }
... )
>>> df
shape: (6, 2)
┌─────────┬─────┐
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ c ┆ 1 │
│ c ┆ 2 │
│ a ┆ 3 │
│ c ┆ 4 │
│ a ┆ 5 │
│ b ┆ 6 │
└─────────┴─────┘
>>> df.group_by("letters").tail(2).sort("letters")
shape: (5, 2)
┌─────────┬─────┐
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ a ┆ 3 │
│ a ┆ 5 │
│ b ┆ 6 │
│ c ┆ 2 │
│ c ┆ 4 │
└─────────┴─────┘
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
.tail(n)
.collect(optimizations=QueryOptFlags.none())
)
def all(self) -> DataFrame:
"""
Aggregate the groups into Series.
Examples
--------
>>> df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})
>>> df.group_by("a", maintain_order=True).all()
shape: (2, 2)
┌─────┬───────────┐
│ a ┆ b │
│ --- ┆ --- │
│ str ┆ list[i64] │
╞═════╪═══════════╡
│ one ┆ [1, 3] │
│ two ┆ [2, 4] │
└─────┴───────────┘
"""
return self.agg(F.all())
def len(self, name: str | None = None) -> DataFrame:
"""
Return the number of rows in each group.
Parameters
----------
name
Assign a name to the resulting column; if unset, defaults to "len".
Examples
--------
>>> df = pl.DataFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
>>> df.group_by("a").len() # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ len │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
>>> df.group_by("a").len(name="n") # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ n │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
"""
len_expr = F.len()
if name is not None:
len_expr = len_expr.alias(name)
return self.agg(len_expr)
@deprecated("`GroupBy.count` was renamed; use `GroupBy.len` instead")
def count(self) -> DataFrame:
"""
Return the number of rows in each group.
.. deprecated:: 0.20.5
This method has been renamed to :func:`GroupBy.len`.
Rows containing null values count towards the total.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": ["Apple", "Apple", "Orange"],
... "b": [1, None, 2],
... }
... )
>>> df.group_by("a").count() # doctest: +SKIP
shape: (2, 2)
┌────────┬───────┐
│ a ┆ count │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═══════╡
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴───────┘
"""
return self.agg(F.len().alias("count"))
def first(self) -> DataFrame:
"""
Aggregate the first values in the group.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).first()
shape: (3, 4)
┌────────┬─────┬──────┬───────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ bool │
╞════════╪═════╪══════╪═══════╡
│ Apple ┆ 1 ┆ 0.5 ┆ true │
│ Orange ┆ 2 ┆ 0.5 ┆ true │
│ Banana ┆ 4 ┆ 13.0 ┆ false │
└────────┴─────┴──────┴───────┘
"""
return self.agg(F.all().first())
def last(self) -> DataFrame:
"""
Aggregate the last values in the group.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 14, 13],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).last()
shape: (3, 4)
┌────────┬─────┬──────┬───────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ bool │
╞════════╪═════╪══════╪═══════╡
│ Apple ┆ 3 ┆ 10.0 ┆ false │
│ Orange ┆ 2 ┆ 0.5 ┆ true │
│ Banana ┆ 5 ┆ 13.0 ┆ true │
└────────┴─────┴──────┴───────┘
"""
return self.agg(F.all().last())
def max(self) -> DataFrame:
"""
Reduce the groups to the maximal value.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).max()
shape: (3, 4)
┌────────┬─────┬──────┬──────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ bool │
╞════════╪═════╪══════╪══════╡
│ Apple ┆ 3 ┆ 10.0 ┆ true │
│ Orange ┆ 2 ┆ 0.5 ┆ true │
│ Banana ┆ 5 ┆ 14.0 ┆ true │
└────────┴─────┴──────┴──────┘
"""
return self.agg(F.all().max())
def mean(self) -> DataFrame:
"""
Reduce the groups to the mean values.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).mean()
shape: (3, 4)
┌────────┬─────┬──────────┬──────────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞════════╪═════╪══════════╪══════════╡
│ Apple ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
│ Orange ┆ 2.0 ┆ 0.5 ┆ 1.0 │
│ Banana ┆ 4.5 ┆ 13.5 ┆ 0.5 │
└────────┴─────┴──────────┴──────────┘
"""
return self.agg(F.all().mean())
def median(self) -> DataFrame:
"""
Return the median per group.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).median()
shape: (2, 3)
┌────────┬─────┬──────┐
│ d ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 │
╞════════╪═════╪══════╡
│ Apple ┆ 2.0 ┆ 4.0 │
│ Banana ┆ 4.0 ┆ 13.0 │
└────────┴─────┴──────┘
"""
return self.agg(F.all().median())
def min(self) -> DataFrame:
"""
Reduce the groups to the minimal value.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).min()
shape: (3, 4)
┌────────┬─────┬──────┬───────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ bool │
╞════════╪═════╪══════╪═══════╡
│ Apple ┆ 1 ┆ 0.5 ┆ false │
│ Orange ┆ 2 ┆ 0.5 ┆ true │
│ Banana ┆ 4 ┆ 13.0 ┆ false │
└────────┴─────┴──────┴───────┘
"""
return self.agg(F.all().min())
def n_unique(self) -> DataFrame:
"""
Count the unique values per group.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 1, 3, 4, 5],
... "b": [0.5, 0.5, 0.5, 10, 13, 14],
... "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).n_unique()
shape: (2, 3)
┌────────┬─────┬─────┐
│ d ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ str ┆ u32 ┆ u32 │
╞════════╪═════╪═════╡
│ Apple ┆ 2 ┆ 2 │
│ Banana ┆ 3 ┆ 3 │
└────────┴─────┴─────┘
"""
return self.agg(F.all().n_unique())
def quantile(
self, quantile: float, interpolation: QuantileMethod = "nearest"
) -> DataFrame:
"""
Compute the quantile per group.
Parameters
----------
quantile
Quantile between 0.0 and 1.0.
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
Interpolation method.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).quantile(1)
shape: (3, 3)
┌────────┬─────┬──────┐
│ d ┆ a ┆ b │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 │
╞════════╪═════╪══════╡
│ Apple ┆ 3.0 ┆ 10.0 │
│ Orange ┆ 2.0 ┆ 0.5 │
│ Banana ┆ 5.0 ┆ 14.0 │
└────────┴─────┴──────┘
""" # noqa: W505
return self.agg(F.all().quantile(quantile, interpolation=interpolation))
def sum(self) -> DataFrame:
"""
Reduce the groups to the sum.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 2, 3, 4, 5],
... "b": [0.5, 0.5, 4, 10, 13, 14],
... "c": [True, True, True, False, False, True],
... "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
... }
... )
>>> df.group_by("d", maintain_order=True).sum()
shape: (3, 4)
┌────────┬─────┬──────┬─────┐
│ d ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ u32 │
╞════════╪═════╪══════╪═════╡
│ Apple ┆ 6 ┆ 14.5 ┆ 2 │
│ Orange ┆ 2 ┆ 0.5 ┆ 1 │
│ Banana ┆ 9 ┆ 27.0 ┆ 1 │
└────────┴─────┴──────┴─────┘
"""
return self.agg(F.all().sum())
class RollingGroupBy:
"""
A rolling grouper.
This has an `.agg` method which allows you to run all polars expressions in a
group by context.
"""
def __init__(
self,
df: DataFrame,
index_column: IntoExpr,
*,
period: str | timedelta,
offset: str | timedelta | None,
closed: ClosedInterval,
group_by: IntoExpr | Iterable[IntoExpr] | None,
) -> None:
period = parse_as_duration_string(period)
offset = parse_as_duration_string(offset)
self.df = df
self.time_column = index_column
self.period = period
self.offset = offset
self.closed = closed
self.group_by = group_by
def __iter__(self) -> Self:
from polars.lazyframe.opt_flags import QueryOptFlags
temp_col = "__POLARS_GB_GROUP_INDICES"
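# Same technique as `GroupBy.__iter__`: gather each window's original row
# positions via a temporary row-index column.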
groups_df = (
self.df.lazy()
.with_row_index()
.rolling(
index_column=self.time_column,
period=self.period,
offset=self.offset,
closed=self.closed,
group_by=self.group_by,
)
.agg(F.first().alias(temp_col))
.collect(optimizations=QueryOptFlags.none())
)
self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0
return self
def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration
group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1
return group_name, group_data
def agg(
self,
*aggs: IntoExpr | Iterable[IntoExpr],
**named_aggs: IntoExpr,
) -> DataFrame:
"""
Compute aggregations for each group of a group by operation.
Parameters
----------
*aggs
Aggregations to compute for each group of the group by operation,
specified as positional arguments.
Accepts expression input. Strings are parsed as column names.
**named_aggs
Additional aggregations, specified as keyword arguments.
The resulting columns will be renamed to the keyword used.
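Examples
--------
A minimal sketch, assuming an integer index column `dt` so that
`period="2i"` means a window of two index values, `(dt - 2, dt]` with the
default `closed="right"`:
>>> df = pl.DataFrame({"dt": [1, 2, 3, 5, 8], "a": [3, 7, 5, 9, 2]}).set_sorted("dt")
>>> df.rolling(index_column="dt", period="2i").agg(
... pl.sum("a").alias("sum_a")
... ) # doctest: +SKIP
The windows above would yield `sum_a` values of 3, 10, 12, 9 and 2.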
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.rolling(
index_column=self.time_column,
period=self.period,
offset=self.offset,
closed=self.closed,
group_by=self.group_by,
)
.agg(*aggs, **named_aggs)
.collect(optimizations=QueryOptFlags.none())
)
def map_groups(
self,
function: Callable[[DataFrame], DataFrame],
schema: SchemaDict | None,
) -> DataFrame:
"""
Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.
Using this is considered an anti-pattern as it will be very slow because:
- it forces the engine to materialize a whole `DataFrame` for each group.
- it is not parallelized.
- it blocks optimizations as the passed python function is opaque to the
optimizer.
The idiomatic way to apply custom functions over multiple columns is using:
`pl.struct([my_columns]).map_elements(lambda struct_series: ..)`
Parameters
----------
function
Function to apply over each group of the `LazyFrame`; it receives
a DataFrame and should return a DataFrame.
schema
Schema of the output function. This has to be known statically. If the
given schema is incorrect, this is a bug in the caller's query and may
lead to errors. If set to None, polars assumes the schema is unchanged.
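Examples
--------
A hedged sketch (prefer expressions where possible), again assuming an
integer index column `dt` and passing the output schema explicitly:
>>> df = pl.DataFrame({"dt": [1, 2, 3], "a": [1, 2, 3]}).set_sorted("dt")
>>> df.rolling(index_column="dt", period="2i").map_groups(
... lambda window: window.select(pl.sum("a")), schema={"a": pl.Int64}
... ) # doctest: +SKIP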
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.rolling(
index_column=self.time_column,
period=self.period,
offset=self.offset,
closed=self.closed,
group_by=self.group_by,
)
.map_groups(function, schema)
.collect(optimizations=QueryOptFlags.none())
)
class DynamicGroupBy:
"""
A dynamic grouper.
This has an `.agg` method which allows you to run all polars expressions in a
group by context.
"""
def __init__(
self,
df: DataFrame,
index_column: IntoExpr,
*,
every: str | timedelta,
period: str | timedelta | None,
offset: str | timedelta | None,
include_boundaries: bool,
closed: ClosedInterval,
label: Label,
group_by: IntoExpr | Iterable[IntoExpr] | None,
start_by: StartBy,
) -> None:
every = parse_as_duration_string(every)
period = parse_as_duration_string(period)
offset = parse_as_duration_string(offset)
self.df = df
self.time_column = index_column
self.every = every
self.period = period
self.offset = offset
self.label = label
self.include_boundaries = include_boundaries
self.closed = closed
self.group_by = group_by
self.start_by = start_by
def __iter__(self) -> Self:
from polars.lazyframe.opt_flags import QueryOptFlags
temp_col = "__POLARS_GB_GROUP_INDICES"
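# Same row-index technique as `GroupBy.__iter__`, applied per dynamic window.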
groups_df = (
self.df.lazy()
.with_row_index()
.group_by_dynamic(
index_column=self.time_column,
every=self.every,
period=self.period,
offset=self.offset,
label=self.label,
include_boundaries=self.include_boundaries,
closed=self.closed,
group_by=self.group_by,
start_by=self.start_by,
)
.agg(F.first().alias(temp_col))
.collect(optimizations=QueryOptFlags.none())
)
self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
self._group_indices = groups_df.select(temp_col).to_series()
self._current_index = 0
return self
def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
if self._current_index >= len(self._group_indices):
raise StopIteration
group_name = next(self._group_names)
group_data = self.df[self._group_indices[self._current_index], :]
self._current_index += 1
return group_name, group_data
def agg(
self,
*aggs: IntoExpr | Iterable[IntoExpr],
**named_aggs: IntoExpr,
) -> DataFrame:
"""
Compute aggregations for each group of a group by operation.
Parameters
----------
*aggs
Aggregations to compute for each group of the group by operation,
specified as positional arguments.
Accepts expression input. Strings are parsed as column names.
**named_aggs
Additional aggregations, specified as keyword arguments.
The resulting columns will be renamed to the keyword used.
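Examples
--------
A minimal sketch; the `time` and `n` columns are illustrative:
>>> from datetime import datetime
>>> df = pl.DataFrame(
... {
... "time": pl.datetime_range(
... datetime(2021, 12, 16), datetime(2021, 12, 16, 3), "1h", eager=True
... ),
... "n": [0, 1, 2, 3],
... }
... )
>>> df.group_by_dynamic("time", every="2h").agg(pl.sum("n")) # doctest: +SKIP
With the default `closed="left"` this gives two windows, with sums 1 and 5.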
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.group_by_dynamic(
index_column=self.time_column,
every=self.every,
period=self.period,
offset=self.offset,
label=self.label,
include_boundaries=self.include_boundaries,
closed=self.closed,
group_by=self.group_by,
start_by=self.start_by,
)
.agg(*aggs, **named_aggs)
.collect(optimizations=QueryOptFlags.none())
)
def map_groups(
self,
function: Callable[[DataFrame], DataFrame],
schema: SchemaDict | None,
) -> DataFrame:
"""
Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.
Using this is considered an anti-pattern as it will be very slow because:
- it forces the engine to materialize a whole `DataFrame` for each group.
- it is not parallelized.
- it blocks optimizations as the passed python function is opaque to the
optimizer.
The idiomatic way to apply custom functions over multiple columns is using:
`pl.struct([my_columns]).map_elements(lambda struct_series: ..)`
Parameters
----------
function
Function to apply over each group of the `LazyFrame`; it receives
a DataFrame and should return a DataFrame.
schema
Schema of the output function. This has to be known statically. If the
given schema is incorrect, this is a bug in the caller's query and may
lead to errors. If set to None, polars assumes the schema is unchanged.
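Examples
--------
A hedged sketch (expressions are preferred); the `time` and `n` columns are
illustrative and the output schema is passed explicitly:
>>> from datetime import datetime
>>> df = pl.DataFrame(
... {
... "time": pl.datetime_range(
... datetime(2021, 12, 16), datetime(2021, 12, 16, 2), "1h", eager=True
... ),
... "n": [0, 1, 2],
... }
... )
>>> df.group_by_dynamic("time", every="1h").map_groups(
... lambda window: window.select(pl.sum("n")), schema={"n": pl.Int64}
... ) # doctest: +SKIP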
"""
from polars.lazyframe.opt_flags import QueryOptFlags
return (
self.df.lazy()
.group_by_dynamic(
index_column=self.time_column,
every=self.every,
period=self.period,
offset=self.offset,
label=self.label,
include_boundaries=self.include_boundaries,
closed=self.closed,
group_by=self.group_by,
start_by=self.start_by,
)
.map_groups(function, schema)
.collect(optimizations=QueryOptFlags.none())
)