from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable

from polars import functions as F
from polars._utils.convert import parse_as_duration_string
from polars._utils.deprecation import deprecated

if TYPE_CHECKING:
    import sys
    from collections.abc import Iterable
    from datetime import timedelta

    from polars import DataFrame
    from polars._typing import (
        ClosedInterval,
        IntoExpr,
        Label,
        QuantileMethod,
        SchemaDict,
        StartBy,
    )

    if sys.version_info >= (3, 11):
        from typing import Self
    else:
        from typing_extensions import Self

    if sys.version_info >= (3, 13):
        from warnings import deprecated
    else:
        from typing_extensions import deprecated  # noqa: TC004


class GroupBy:
    """Starts a new GroupBy operation."""

    def __init__(
        self,
        df: DataFrame,
        *by: IntoExpr | Iterable[IntoExpr],
        maintain_order: bool,
        **named_by: IntoExpr,
    ) -> None:
        """
        Utility class for performing a group by operation over the given DataFrame.

        Generated by calling `df.group_by(...)`.

        Parameters
        ----------
        df
            DataFrame to perform the group by operation over.
        *by
            Column or columns to group by. Accepts expression input. Strings are
            parsed as column names.
        maintain_order
            Ensure that the order of the groups is consistent with the input data.
            This is slower than a default group by.
        **named_by
            Additional column(s) to group by, specified as keyword arguments.
            The columns will be named as the keyword used.
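
        Examples
        --------
        A minimal sketch (illustrative data; output omitted):

        >>> df = pl.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
        >>> df.group_by("a", maintain_order=True).agg(pl.sum("b"))  # doctest: +SKIP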
"""
|
|
self.df = df
|
|
self.by = by
|
|
self.named_by = named_by
|
|
self.maintain_order = maintain_order
|
|
|
|

    def __iter__(self) -> Self:
        """
        Allows iteration over the groups of the group by operation.

        Each group is represented by a tuple of `(name, data)`. The group names are
        tuples of the distinct group values that identify each group.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": ["a", "a", "b"], "bar": [1, 2, 3]})
        >>> for name, data in df.group_by("foo"):  # doctest: +SKIP
        ...     print(name)
        ...     print(data)
        ('a',)
        shape: (2, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ a   ┆ 1   │
        │ a   ┆ 2   │
        └─────┴─────┘
        ('b',)
        shape: (1, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ b   ┆ 3   │
        └─────┴─────┘
        """
        # Every group gather can trigger a rechunk, so rechunk once in advance.
        from polars.lazyframe.opt_flags import QueryOptFlags

        self.df = self.df.rechunk()
        temp_col = "__POLARS_GB_GROUP_INDICES"
        # Attach a row index, then aggregate it per group; this yields, for each
        # group, the list of row indices that make up the group.
        groups_df = (
            self.df.lazy()
            .with_row_index()
            .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
            .agg(F.first().alias(temp_col))
            .collect(optimizations=QueryOptFlags.none())
        )

        self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
        self._group_indices = groups_df.select(temp_col).to_series()
        self._current_index = 0

        return self

    def __next__(self) -> tuple[tuple[Any, ...], DataFrame]:
        if self._current_index >= len(self._group_indices):
            raise StopIteration

        group_name = next(self._group_names)
        group_data = self.df[self._group_indices[self._current_index], :]
        self._current_index += 1

        return group_name, group_data

    def agg(
        self,
        *aggs: IntoExpr | Iterable[IntoExpr],
        **named_aggs: IntoExpr,
    ) -> DataFrame:
        """
        Compute aggregations for each group of a group by operation.

        Parameters
        ----------
        *aggs
            Aggregations to compute for each group of the group by operation,
            specified as positional arguments.
            Accepts expression input. Strings are parsed as column names.
        **named_aggs
            Additional aggregations, specified as keyword arguments.
            The resulting columns will be renamed to the keyword used.

        Examples
        --------
        Compute the aggregation of the columns for each group.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "c"],
        ...         "b": [1, 2, 1, 3, 3],
        ...         "c": [5, 4, 3, 2, 1],
        ...     }
        ... )
        >>> df.group_by("a").agg(pl.col("b"), pl.col("c"))  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────────┬───────────┐
        │ a   ┆ b         ┆ c         │
        │ --- ┆ ---       ┆ ---       │
        │ str ┆ list[i64] ┆ list[i64] │
        ╞═════╪═══════════╪═══════════╡
        │ a   ┆ [1, 1]    ┆ [5, 3]    │
        │ b   ┆ [2, 3]    ┆ [4, 2]    │
        │ c   ┆ [3]       ┆ [1]       │
        └─────┴───────────┴───────────┘

        Compute the sum of a column for each group.

        >>> df.group_by("a").agg(pl.col("b").sum())  # doctest: +IGNORE_RESULT
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ a   ┆ 2   │
        │ b   ┆ 5   │
        │ c   ┆ 3   │
        └─────┴─────┘

        Compute multiple aggregates at once by passing a list of expressions.

        >>> df.group_by("a").agg([pl.sum("b"), pl.mean("c")])  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ f64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1.0 │
        │ a   ┆ 2   ┆ 4.0 │
        │ b   ┆ 5   ┆ 3.0 │
        └─────┴─────┴─────┘

        Or use positional arguments to compute multiple aggregations in the same way.

        >>> df.group_by("a").agg(
        ...     pl.sum("b").name.suffix("_sum"),
        ...     (pl.col("c") ** 2).mean().name.suffix("_mean_squared"),
        ... )  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────┬────────────────┐
        │ a   ┆ b_sum ┆ c_mean_squared │
        │ --- ┆ ---   ┆ ---            │
        │ str ┆ i64   ┆ f64            │
        ╞═════╪═══════╪════════════════╡
        │ a   ┆ 2     ┆ 17.0           │
        │ c   ┆ 3     ┆ 1.0            │
        │ b   ┆ 5     ┆ 10.0           │
        └─────┴───────┴────────────────┘

        Use keyword arguments to easily name your expression inputs.

        >>> df.group_by("a").agg(
        ...     b_sum=pl.sum("b"),
        ...     c_mean_squared=(pl.col("c") ** 2).mean(),
        ... )  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────┬────────────────┐
        │ a   ┆ b_sum ┆ c_mean_squared │
        │ --- ┆ ---   ┆ ---            │
        │ str ┆ i64   ┆ f64            │
        ╞═════╪═══════╪════════════════╡
        │ a   ┆ 2     ┆ 17.0           │
        │ c   ┆ 3     ┆ 1.0            │
        │ b   ┆ 5     ┆ 10.0           │
        └─────┴───────┴────────────────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
            .agg(*aggs, **named_aggs)
            .collect(optimizations=QueryOptFlags.none())
        )

    def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame:
        """
        Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.

        .. warning::
            This method is much slower than the native expressions API.
            Only use it if you cannot implement your logic otherwise.

        Implementing logic using a Python function is almost always *significantly*
        slower and more memory intensive than implementing the same logic using
        the native expression API because:

        - The native expression engine runs in Rust; UDFs run in Python.
        - Use of Python UDFs forces the DataFrame to be materialized in memory.
        - Polars-native expressions can be parallelised (UDFs cannot).
        - Polars-native expressions can be logically optimised (UDFs cannot).

        Wherever possible you should strongly prefer the native expression API
        to achieve the best performance.

        Parameters
        ----------
        function
            Custom function that receives a DataFrame and returns a DataFrame.

        Returns
        -------
        DataFrame

        Examples
        --------
        For each color group sample two rows:

        >>> df = pl.DataFrame(
        ...     {
        ...         "id": [0, 1, 2, 3, 4],
        ...         "color": ["red", "green", "green", "red", "red"],
        ...         "shape": ["square", "triangle", "square", "triangle", "square"],
        ...     }
        ... )
        >>> df.group_by("color").map_groups(
        ...     lambda group_df: group_df.sample(2)
        ... )  # doctest: +IGNORE_RESULT
        shape: (4, 3)
        ┌─────┬───────┬──────────┐
        │ id  ┆ color ┆ shape    │
        │ --- ┆ ---   ┆ ---      │
        │ i64 ┆ str   ┆ str      │
        ╞═════╪═══════╪══════════╡
        │ 1   ┆ green ┆ triangle │
        │ 2   ┆ green ┆ square   │
        │ 4   ┆ red   ┆ square   │
        │ 3   ┆ red   ┆ triangle │
        └─────┴───────┴──────────┘

        It is better to implement this with an expression:

        >>> df.filter(
        ...     pl.int_range(pl.len()).shuffle().over("color") < 2
        ... )  # doctest: +IGNORE_RESULT
        """
        if self.named_by:
            msg = "cannot call `map_groups` when grouping by named expressions"
            raise TypeError(msg)
        if not all(isinstance(c, str) for c in self.by):
            msg = "cannot call `map_groups` when grouping by an expression"
            raise TypeError(msg)

        by_strs: list[str] = self.by  # type: ignore[assignment]

        return self.df.__class__._from_pydf(
            self.df._df.group_by_map_groups(by_strs, function, self.maintain_order)
        )

    def head(self, n: int = 5) -> DataFrame:
        """
        Get the first `n` rows of each group.

        Parameters
        ----------
        n
            Number of rows to return.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["c", "c", "a", "c", "a", "b"],
        ...         "nrs": [1, 2, 3, 4, 5, 6],
        ...     }
        ... )
        >>> df
        shape: (6, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        │ a       ┆ 3   │
        │ c       ┆ 4   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        └─────────┴─────┘
        >>> df.group_by("letters").head(2).sort("letters")
        shape: (5, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ a       ┆ 3   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        └─────────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
            .head(n)
            .collect(optimizations=QueryOptFlags.none())
        )

    def tail(self, n: int = 5) -> DataFrame:
        """
        Get the last `n` rows of each group.

        Parameters
        ----------
        n
            Number of rows to return.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["c", "c", "a", "c", "a", "b"],
        ...         "nrs": [1, 2, 3, 4, 5, 6],
        ...     }
        ... )
        >>> df
        shape: (6, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        │ a       ┆ 3   │
        │ c       ┆ 4   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        └─────────┴─────┘
        >>> df.group_by("letters").tail(2).sort("letters")
        shape: (5, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ a       ┆ 3   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        │ c       ┆ 2   │
        │ c       ┆ 4   │
        └─────────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .group_by(*self.by, **self.named_by, maintain_order=self.maintain_order)
            .tail(n)
            .collect(optimizations=QueryOptFlags.none())
        )

    def all(self) -> DataFrame:
        """
        Aggregate the groups into Series.

        Examples
        --------
        >>> df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})
        >>> df.group_by("a", maintain_order=True).all()
        shape: (2, 2)
        ┌─────┬───────────┐
        │ a   ┆ b         │
        │ --- ┆ ---       │
        │ str ┆ list[i64] │
        ╞═════╪═══════════╡
        │ one ┆ [1, 3]    │
        │ two ┆ [2, 4]    │
        └─────┴───────────┘
        """
        return self.agg(F.all())

    def len(self, name: str | None = None) -> DataFrame:
        """
        Return the number of rows in each group.

        Parameters
        ----------
        name
            Assign a name to the resulting column; if unset, defaults to "len".

        Examples
        --------
        >>> df = pl.DataFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
        >>> df.group_by("a").len()  # doctest: +IGNORE_RESULT
        shape: (2, 2)
        ┌────────┬─────┐
        │ a      ┆ len │
        │ ---    ┆ --- │
        │ str    ┆ u32 │
        ╞════════╪═════╡
        │ Apple  ┆ 2   │
        │ Orange ┆ 1   │
        └────────┴─────┘
        >>> df.group_by("a").len(name="n")  # doctest: +IGNORE_RESULT
        shape: (2, 2)
        ┌────────┬─────┐
        │ a      ┆ n   │
        │ ---    ┆ --- │
        │ str    ┆ u32 │
        ╞════════╪═════╡
        │ Apple  ┆ 2   │
        │ Orange ┆ 1   │
        └────────┴─────┘
        """
        len_expr = F.len()
        if name is not None:
            len_expr = len_expr.alias(name)
        return self.agg(len_expr)

    @deprecated("`GroupBy.count` was renamed; use `GroupBy.len` instead")
    def count(self) -> DataFrame:
        """
        Return the number of rows in each group.

        .. deprecated:: 0.20.5
            This method has been renamed to :func:`GroupBy.len`.

        Rows containing null values count towards the total.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["Apple", "Apple", "Orange"],
        ...         "b": [1, None, 2],
        ...     }
        ... )
        >>> df.group_by("a").count()  # doctest: +SKIP
        shape: (2, 2)
        ┌────────┬───────┐
        │ a      ┆ count │
        │ ---    ┆ ---   │
        │ str    ┆ u32   │
        ╞════════╪═══════╡
        │ Apple  ┆ 2     │
        │ Orange ┆ 1     │
        └────────┴───────┘
        """
        return self.agg(F.len().alias("count"))

    def first(self) -> DataFrame:
        """
        Aggregate the first values in the group.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).first()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 1   ┆ 0.5  ┆ true  │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 4   ┆ 13.0 ┆ false │
        └────────┴─────┴──────┴───────┘
        """
        return self.agg(F.all().first())

    def last(self) -> DataFrame:
        """
        Aggregate the last values in the group.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 14, 13],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).last()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 3   ┆ 10.0 ┆ false │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 5   ┆ 13.0 ┆ true  │
        └────────┴─────┴──────┴───────┘
        """
        return self.agg(F.all().last())

    def max(self) -> DataFrame:
        """
        Reduce the groups to the maximal value.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).max()
        shape: (3, 4)
        ┌────────┬─────┬──────┬──────┐
        │ d      ┆ a   ┆ b    ┆ c    │
        │ ---    ┆ --- ┆ ---  ┆ ---  │
        │ str    ┆ i64 ┆ f64  ┆ bool │
        ╞════════╪═════╪══════╪══════╡
        │ Apple  ┆ 3   ┆ 10.0 ┆ true │
        │ Orange ┆ 2   ┆ 0.5  ┆ true │
        │ Banana ┆ 5   ┆ 14.0 ┆ true │
        └────────┴─────┴──────┴──────┘
        """
        return self.agg(F.all().max())

    def mean(self) -> DataFrame:
        """
        Reduce the groups to the mean values.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).mean()
        shape: (3, 4)
        ┌────────┬─────┬──────────┬──────────┐
        │ d      ┆ a   ┆ b        ┆ c        │
        │ ---    ┆ --- ┆ ---      ┆ ---      │
        │ str    ┆ f64 ┆ f64      ┆ f64      │
        ╞════════╪═════╪══════════╪══════════╡
        │ Apple  ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
        │ Orange ┆ 2.0 ┆ 0.5      ┆ 1.0      │
        │ Banana ┆ 4.5 ┆ 13.5     ┆ 0.5      │
        └────────┴─────┴──────────┴──────────┘
        """
        return self.agg(F.all().mean())

    def median(self) -> DataFrame:
        """
        Return the median per group.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).median()
        shape: (2, 3)
        ┌────────┬─────┬──────┐
        │ d      ┆ a   ┆ b    │
        │ ---    ┆ --- ┆ ---  │
        │ str    ┆ f64 ┆ f64  │
        ╞════════╪═════╪══════╡
        │ Apple  ┆ 2.0 ┆ 4.0  │
        │ Banana ┆ 4.0 ┆ 13.0 │
        └────────┴─────┴──────┘
        """
        return self.agg(F.all().median())

    def min(self) -> DataFrame:
        """
        Reduce the groups to the minimal value.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).min()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 1   ┆ 0.5  ┆ false │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 4   ┆ 13.0 ┆ false │
        └────────┴─────┴──────┴───────┘
        """
        return self.agg(F.all().min())

    def n_unique(self) -> DataFrame:
        """
        Count the unique values per group.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 1, 3, 4, 5],
        ...         "b": [0.5, 0.5, 0.5, 10, 13, 14],
        ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).n_unique()
        shape: (2, 3)
        ┌────────┬─────┬─────┐
        │ d      ┆ a   ┆ b   │
        │ ---    ┆ --- ┆ --- │
        │ str    ┆ u32 ┆ u32 │
        ╞════════╪═════╪═════╡
        │ Apple  ┆ 2   ┆ 2   │
        │ Banana ┆ 3   ┆ 3   │
        └────────┴─────┴─────┘
        """
        return self.agg(F.all().n_unique())

    def quantile(
        self, quantile: float, interpolation: QuantileMethod = "nearest"
    ) -> DataFrame:
        """
        Compute the quantile per group.

        Parameters
        ----------
        quantile
            Quantile between 0.0 and 1.0.
        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
            Interpolation method.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).quantile(1)
        shape: (3, 3)
        ┌────────┬─────┬──────┐
        │ d      ┆ a   ┆ b    │
        │ ---    ┆ --- ┆ ---  │
        │ str    ┆ f64 ┆ f64  │
        ╞════════╪═════╪══════╡
        │ Apple  ┆ 3.0 ┆ 10.0 │
        │ Orange ┆ 2.0 ┆ 0.5  │
        │ Banana ┆ 5.0 ┆ 14.0 │
        └────────┴─────┴──────┘
        """  # noqa: W505
        return self.agg(F.all().quantile(quantile, interpolation=interpolation))

    def sum(self) -> DataFrame:
        """
        Reduce the groups to the sum.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... )
        >>> df.group_by("d", maintain_order=True).sum()
        shape: (3, 4)
        ┌────────┬─────┬──────┬─────┐
        │ d      ┆ a   ┆ b    ┆ c   │
        │ ---    ┆ --- ┆ ---  ┆ --- │
        │ str    ┆ i64 ┆ f64  ┆ u32 │
        ╞════════╪═════╪══════╪═════╡
        │ Apple  ┆ 6   ┆ 14.5 ┆ 2   │
        │ Orange ┆ 2   ┆ 0.5  ┆ 1   │
        │ Banana ┆ 9   ┆ 27.0 ┆ 1   │
        └────────┴─────┴──────┴─────┘
        """
        return self.agg(F.all().sum())


class RollingGroupBy:
    """
    A rolling grouper.

    This has an `.agg` method which allows you to run all polars expressions in a
    group by context.
    """

    def __init__(
        self,
        df: DataFrame,
        index_column: IntoExpr,
        *,
        period: str | timedelta,
        offset: str | timedelta | None,
        closed: ClosedInterval,
        group_by: IntoExpr | Iterable[IntoExpr] | None,
    ) -> None:
        period = parse_as_duration_string(period)
        offset = parse_as_duration_string(offset)

        self.df = df
        self.time_column = index_column
        self.period = period
        self.offset = offset
        self.closed = closed
        self.group_by = group_by

    def __iter__(self) -> Self:
        from polars.lazyframe.opt_flags import QueryOptFlags

        temp_col = "__POLARS_GB_GROUP_INDICES"
        # As in `GroupBy.__iter__`: collect the row indices per window so the
        # group frames can be gathered one by one.
        groups_df = (
            self.df.lazy()
            .with_row_index()
            .rolling(
                index_column=self.time_column,
                period=self.period,
                offset=self.offset,
                closed=self.closed,
                group_by=self.group_by,
            )
            .agg(F.first().alias(temp_col))
            .collect(optimizations=QueryOptFlags.none())
        )

        self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
        self._group_indices = groups_df.select(temp_col).to_series()
        self._current_index = 0

        return self

    def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
        if self._current_index >= len(self._group_indices):
            raise StopIteration

        group_name = next(self._group_names)
        group_data = self.df[self._group_indices[self._current_index], :]
        self._current_index += 1

        return group_name, group_data

    def agg(
        self,
        *aggs: IntoExpr | Iterable[IntoExpr],
        **named_aggs: IntoExpr,
    ) -> DataFrame:
        """
        Compute aggregations for each group of a group by operation.

        Parameters
        ----------
        *aggs
            Aggregations to compute for each group of the group by operation,
            specified as positional arguments.
            Accepts expression input. Strings are parsed as column names.
        **named_aggs
            Additional aggregations, specified as keyword arguments.
            The resulting columns will be renamed to the keyword used.
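
        Examples
        --------
        A minimal sketch (illustrative data and column names; output omitted):

        >>> df = pl.DataFrame(
        ...     {
        ...         "dt": ["2021-01-01", "2021-01-02", "2021-01-04"],
        ...         "value": [1, 2, 4],
        ...     }
        ... ).with_columns(pl.col("dt").str.to_date())
        >>> df.rolling(index_column="dt", period="2d").agg(
        ...     pl.sum("value").alias("sum_value")
        ... )  # doctest: +SKIP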
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .rolling(
                index_column=self.time_column,
                period=self.period,
                offset=self.offset,
                closed=self.closed,
                group_by=self.group_by,
            )
            .agg(*aggs, **named_aggs)
            .collect(optimizations=QueryOptFlags.none())
        )

    def map_groups(
        self,
        function: Callable[[DataFrame], DataFrame],
        schema: SchemaDict | None,
    ) -> DataFrame:
        """
        Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

        Using this is considered an anti-pattern as it will be very slow because:

        - it forces the engine to materialize the whole `DataFrames` for the groups.
        - it is not parallelized.
        - it blocks optimizations as the passed python function is opaque to the
          optimizer.

        The idiomatic way to apply custom functions over multiple columns is using:

        `pl.struct([my_columns]).map_elements(lambda struct_series: ..)`
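
        For example, a sketch (with hypothetical column names "a" and "b") of
        summing two columns element-wise through a struct instead of mapping
        over whole group frames:

        >>> df.select(
        ...     pl.struct(["a", "b"]).map_elements(lambda row: row["a"] + row["b"])
        ... )  # doctest: +SKIP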

        Parameters
        ----------
        function
            Function to apply over each group of the `LazyFrame`; it receives
            a DataFrame and should return a DataFrame.
        schema
            Schema of the output function. This has to be known statically. If the
            given schema is incorrect, this is a bug in the caller's query and may
            lead to errors. If set to None, polars assumes the schema is unchanged.
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .rolling(
                index_column=self.time_column,
                period=self.period,
                offset=self.offset,
                closed=self.closed,
                group_by=self.group_by,
            )
            .map_groups(function, schema)
            .collect(optimizations=QueryOptFlags.none())
        )


class DynamicGroupBy:
    """
    A dynamic grouper.

    This has an `.agg` method which allows you to run all polars expressions in a
    group by context.
    """

    def __init__(
        self,
        df: DataFrame,
        index_column: IntoExpr,
        *,
        every: str | timedelta,
        period: str | timedelta | None,
        offset: str | timedelta | None,
        include_boundaries: bool,
        closed: ClosedInterval,
        label: Label,
        group_by: IntoExpr | Iterable[IntoExpr] | None,
        start_by: StartBy,
    ) -> None:
        every = parse_as_duration_string(every)
        period = parse_as_duration_string(period)
        offset = parse_as_duration_string(offset)

        self.df = df
        self.time_column = index_column
        self.every = every
        self.period = period
        self.offset = offset
        self.label = label
        self.include_boundaries = include_boundaries
        self.closed = closed
        self.group_by = group_by
        self.start_by = start_by

    def __iter__(self) -> Self:
        from polars.lazyframe.opt_flags import QueryOptFlags

        temp_col = "__POLARS_GB_GROUP_INDICES"
        groups_df = (
            self.df.lazy()
            .with_row_index()
            .group_by_dynamic(
                index_column=self.time_column,
                every=self.every,
                period=self.period,
                offset=self.offset,
                label=self.label,
                include_boundaries=self.include_boundaries,
                closed=self.closed,
                group_by=self.group_by,
                start_by=self.start_by,
            )
            .agg(F.first().alias(temp_col))
            .collect(optimizations=QueryOptFlags.none())
        )

        self._group_names = groups_df.select(F.all().exclude(temp_col)).iter_rows()
        self._group_indices = groups_df.select(temp_col).to_series()
        self._current_index = 0

        return self

    def __next__(self) -> tuple[tuple[object, ...], DataFrame]:
        if self._current_index >= len(self._group_indices):
            raise StopIteration

        group_name = next(self._group_names)
        group_data = self.df[self._group_indices[self._current_index], :]
        self._current_index += 1

        return group_name, group_data

    def agg(
        self,
        *aggs: IntoExpr | Iterable[IntoExpr],
        **named_aggs: IntoExpr,
    ) -> DataFrame:
        """
        Compute aggregations for each group of a group by operation.

        Parameters
        ----------
        *aggs
            Aggregations to compute for each group of the group by operation,
            specified as positional arguments.
            Accepts expression input. Strings are parsed as column names.
        **named_aggs
            Additional aggregations, specified as keyword arguments.
            The resulting columns will be renamed to the keyword used.
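
        Examples
        --------
        A minimal sketch (illustrative data and column names; output omitted):

        >>> df = pl.DataFrame(
        ...     {
        ...         "dt": ["2021-01-01", "2021-01-01", "2021-01-02"],
        ...         "value": [1, 2, 4],
        ...     }
        ... ).with_columns(pl.col("dt").str.to_date())
        >>> df.group_by_dynamic("dt", every="1d").agg(
        ...     pl.col("value").sum()
        ... )  # doctest: +SKIP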
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .group_by_dynamic(
                index_column=self.time_column,
                every=self.every,
                period=self.period,
                offset=self.offset,
                label=self.label,
                include_boundaries=self.include_boundaries,
                closed=self.closed,
                group_by=self.group_by,
                start_by=self.start_by,
            )
            .agg(*aggs, **named_aggs)
            .collect(optimizations=QueryOptFlags.none())
        )

    def map_groups(
        self,
        function: Callable[[DataFrame], DataFrame],
        schema: SchemaDict | None,
    ) -> DataFrame:
        """
        Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

        Using this is considered an anti-pattern as it will be very slow because:

        - it forces the engine to materialize the whole `DataFrames` for the groups.
        - it is not parallelized.
        - it blocks optimizations as the passed python function is opaque to the
          optimizer.

        The idiomatic way to apply custom functions over multiple columns is using:

        `pl.struct([my_columns]).map_elements(lambda struct_series: ..)`
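
        For example, a sketch (with hypothetical column names "num" and "den")
        of computing an element-wise ratio through a struct instead of mapping
        over whole group frames:

        >>> df.select(
        ...     pl.struct(["num", "den"]).map_elements(lambda row: row["num"] / row["den"])
        ... )  # doctest: +SKIP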

        Parameters
        ----------
        function
            Function to apply over each group of the `LazyFrame`; it receives
            a DataFrame and should return a DataFrame.
        schema
            Schema of the output function. This has to be known statically. If the
            given schema is incorrect, this is a bug in the caller's query and may
            lead to errors. If set to None, polars assumes the schema is unchanged.
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        return (
            self.df.lazy()
            .group_by_dynamic(
                index_column=self.time_column,
                every=self.every,
                period=self.period,
                offset=self.offset,
                label=self.label,
                include_boundaries=self.include_boundaries,
                closed=self.closed,
                group_by=self.group_by,
                start_by=self.start_by,
            )
            .map_groups(function, schema)
            .collect(optimizations=QueryOptFlags.none())
        )