DriverTrac/venv/lib/python3.12/site-packages/polars/lazyframe/group_by.py

670 lines
25 KiB
Python

from __future__ import annotations
from typing import TYPE_CHECKING, Callable
from polars import functions as F
from polars._utils.deprecation import deprecated
from polars._utils.parse import parse_into_list_of_expressions
from polars._utils.wrap import wrap_df, wrap_ldf
if TYPE_CHECKING:
import sys
from collections.abc import Iterable
from polars import DataFrame, LazyFrame
from polars._plr import PyLazyGroupBy
from polars._typing import IntoExpr, QuantileMethod, SchemaDict
if sys.version_info >= (3, 13):
from warnings import deprecated
else:
from typing_extensions import deprecated # noqa: TC004
class LazyGroupBy:
    """
    Handle for running per-group operations on a lazy DataFrame.

    Instances are produced by `df.lazy().group_by(...)`; every public method
    returns a new `LazyFrame`.
    """

    def __init__(self, lgb: PyLazyGroupBy) -> None:
        """Keep a reference to the native group by object all methods delegate to."""
        self.lgb = lgb

    def agg(
        self,
        *aggs: IntoExpr | Iterable[IntoExpr],
        **named_aggs: IntoExpr,
    ) -> LazyFrame:
        """
        Compute aggregations for each group of a group by operation.

        Parameters
        ----------
        *aggs
            Aggregations to compute for each group of the group by operation,
            specified as positional arguments.
            Accepts expression input. Strings are parsed as column names.
        **named_aggs
            Additional aggregations, specified as keyword arguments.
            The resulting columns will be renamed to the keyword used.

        Raises
        ------
        TypeError
            If the first positional aggregation is a dictionary.

        Examples
        --------
        Compute the aggregation of the columns for each group.

        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "c"],
        ...         "b": [1, 2, 1, 3, 3],
        ...         "c": [5, 4, 3, 2, 1],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("a").agg(
        ...     [pl.col("b"), pl.col("c")]
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────────┬───────────┐
        │ a   ┆ b         ┆ c         │
        │ --- ┆ ---       ┆ ---       │
        │ str ┆ list[i64] ┆ list[i64] │
        ╞═════╪═══════════╪═══════════╡
        │ a   ┆ [1, 1]    ┆ [5, 3]    │
        ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
        │ b   ┆ [2, 3]    ┆ [4, 2]    │
        ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
        │ c   ┆ [3]       ┆ [1]       │
        └─────┴───────────┴───────────┘

        Compute the sum of a column for each group.

        >>> ldf.group_by("a").agg(
        ...     pl.col("b").sum()
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ a   ┆ 2   │
        │ b   ┆ 5   │
        │ c   ┆ 3   │
        └─────┴─────┘

        Compute multiple aggregates at once by passing a list of expressions.

        >>> ldf.group_by("a").agg(
        ...     [pl.sum("b"), pl.mean("c")]
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ f64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1.0 │
        │ a   ┆ 2   ┆ 4.0 │
        │ b   ┆ 5   ┆ 3.0 │
        └─────┴─────┴─────┘

        Or use positional arguments to compute multiple aggregations in the same way.

        >>> ldf.group_by("a").agg(
        ...     pl.sum("b").name.suffix("_sum"),
        ...     (pl.col("c") ** 2).mean().name.suffix("_mean_squared"),
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────┬────────────────┐
        │ a   ┆ b_sum ┆ c_mean_squared │
        │ --- ┆ ---   ┆ ---            │
        │ str ┆ i64   ┆ f64            │
        ╞═════╪═══════╪════════════════╡
        │ a   ┆ 2     ┆ 17.0           │
        │ c   ┆ 3     ┆ 1.0            │
        │ b   ┆ 5     ┆ 10.0           │
        └─────┴───────┴────────────────┘

        Use keyword arguments to easily name your expression inputs.

        >>> ldf.group_by("a").agg(
        ...     b_sum=pl.sum("b"),
        ...     c_mean_squared=(pl.col("c") ** 2).mean(),
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (3, 3)
        ┌─────┬───────┬────────────────┐
        │ a   ┆ b_sum ┆ c_mean_squared │
        │ --- ┆ ---   ┆ ---            │
        │ str ┆ i64   ┆ f64            │
        ╞═════╪═══════╪════════════════╡
        │ a   ┆ 2     ┆ 17.0           │
        │ c   ┆ 3     ┆ 1.0            │
        │ b   ┆ 5     ┆ 10.0           │
        └─────┴───────┴────────────────┘
        """
        # Only the leading positional argument is inspected; a dict there is
        # rejected with a hint to use the keyword syntax instead.
        leading = aggs[0] if aggs else None
        if isinstance(leading, dict):
            msg = (
                "specifying aggregations as a dictionary is not supported"
                "\n\nTry unpacking the dictionary to take advantage of the keyword syntax"
                " of the `agg` method."
            )
            raise TypeError(msg)

        exprs = parse_into_list_of_expressions(*aggs, **named_aggs)
        aggregated = self.lgb.agg(exprs)
        return wrap_ldf(aggregated)

    def map_groups(
        self,
        function: Callable[[DataFrame], DataFrame],
        schema: SchemaDict | None,
    ) -> LazyFrame:
        """
        Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

        .. warning::
            This method is much slower than the native expressions API.
            Only use it if you cannot implement your logic otherwise.

        Using this is considered an anti-pattern as it will be very slow because:

        - it forces the engine to materialize the whole `DataFrames` for the groups.
        - it is not parallelized
        - it blocks optimizations as the passed python function is opaque to the
          optimizer

        The idiomatic way to apply custom functions over multiple columns is using
        (note that `Expr.apply` has been renamed to `Expr.map_elements`):

        `pl.struct([my_columns]).map_elements(lambda struct_series: ..)`

        Parameters
        ----------
        function
            Function to apply over each group of the `LazyFrame`.
        schema
            Schema of the output function. This has to be known statically. If the
            given schema is incorrect, this is a bug in the caller's query and may
            lead to errors. If set to None, polars assumes the schema is unchanged.

        Examples
        --------
        For each color group sample two rows:

        >>> df = pl.DataFrame(
        ...     {
        ...         "id": [0, 1, 2, 3, 4],
        ...         "color": ["red", "green", "green", "red", "red"],
        ...         "shape": ["square", "triangle", "square", "triangle", "square"],
        ...     }
        ... )
        >>> (
        ...     df.lazy()
        ...     .group_by("color")
        ...     .map_groups(lambda group_df: group_df.sample(2), schema=None)
        ...     .collect()
        ... )  # doctest: +IGNORE_RESULT
        shape: (4, 3)
        ┌─────┬───────┬──────────┐
        │ id  ┆ color ┆ shape    │
        │ --- ┆ ---   ┆ ---      │
        │ i64 ┆ str   ┆ str      │
        ╞═════╪═══════╪══════════╡
        │ 1   ┆ green ┆ triangle │
        │ 2   ┆ green ┆ square   │
        │ 4   ┆ red   ┆ square   │
        │ 3   ┆ red   ┆ triangle │
        └─────┴───────┴──────────┘

        It is better to implement this with an expression:

        >>> df.lazy().filter(
        ...     pl.int_range(pl.len()).shuffle().over("color") < 2
        ... ).collect()  # doctest: +IGNORE_RESULT
        """
        # The native layer hands over and expects back raw frame objects, so the
        # user function is bridged: wrap the input, unwrap (`._df`) the result.
        mapped = self.lgb.map_groups(
            lambda py_group: function(wrap_df(py_group))._df,
            schema,
        )
        return wrap_ldf(mapped)

    def head(self, n: int = 5) -> LazyFrame:
        """
        Get the first `n` rows of each group.

        Parameters
        ----------
        n
            Number of rows to return.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["c", "c", "a", "c", "a", "b"],
        ...         "nrs": [1, 2, 3, 4, 5, 6],
        ...     }
        ... )
        >>> df
        shape: (6, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        │ a       ┆ 3   │
        │ c       ┆ 4   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        └─────────┴─────┘
        >>> df.lazy().group_by("letters").head(2).sort("letters").collect()
        shape: (5, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ a       ┆ 3   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        └─────────┴─────┘
        """
        leading_rows = self.lgb.head(n)
        return wrap_ldf(leading_rows)

    def tail(self, n: int = 5) -> LazyFrame:
        """
        Get the last `n` rows of each group.

        Parameters
        ----------
        n
            Number of rows to return.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["c", "c", "a", "c", "a", "b"],
        ...         "nrs": [1, 2, 3, 4, 5, 6],
        ...     }
        ... )
        >>> df
        shape: (6, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ c       ┆ 1   │
        │ c       ┆ 2   │
        │ a       ┆ 3   │
        │ c       ┆ 4   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        └─────────┴─────┘
        >>> df.lazy().group_by("letters").tail(2).sort("letters").collect()
        shape: (5, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ a       ┆ 3   │
        │ a       ┆ 5   │
        │ b       ┆ 6   │
        │ c       ┆ 2   │
        │ c       ┆ 4   │
        └─────────┴─────┘
        """
        trailing_rows = self.lgb.tail(n)
        return wrap_ldf(trailing_rows)

    def all(self) -> LazyFrame:
        """
        Aggregate the groups into Series.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": ["one", "two", "one", "two"],
        ...         "b": [1, 2, 3, 4],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("a", maintain_order=True).all().collect()
        shape: (2, 2)
        ┌─────┬───────────┐
        │ a   ┆ b         │
        │ --- ┆ ---       │
        │ str ┆ list[i64] │
        ╞═════╪═══════════╡
        │ one ┆ [1, 3]    │
        │ two ┆ [2, 4]    │
        └─────┴───────────┘
        """
        every_column = F.all()
        return self.agg(every_column)

    def len(self, name: str | None = None) -> LazyFrame:
        """
        Return the number of rows in each group.

        Parameters
        ----------
        name
            Assign a name to the resulting column; if unset, defaults to "len".

        Examples
        --------
        >>> lf = pl.LazyFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
        >>> lf.group_by("a").len().collect()  # doctest: +IGNORE_RESULT
        shape: (2, 2)
        ┌────────┬─────┐
        │ a      ┆ len │
        │ ---    ┆ --- │
        │ str    ┆ u32 │
        ╞════════╪═════╡
        │ Apple  ┆ 2   │
        │ Orange ┆ 1   │
        └────────┴─────┘
        >>> lf.group_by("a").len(name="n").collect()  # doctest: +IGNORE_RESULT
        shape: (2, 2)
        ┌────────┬─────┐
        │ a      ┆ n   │
        │ ---    ┆ --- │
        │ str    ┆ u32 │
        ╞════════╪═════╡
        │ Apple  ┆ 2   │
        │ Orange ┆ 1   │
        └────────┴─────┘
        """
        # Only alias when a name was explicitly requested.
        row_count = F.len() if name is None else F.len().alias(name)
        return self.agg(row_count)

    @deprecated("`count` was renamed; use `len` instead")
    def count(self) -> LazyFrame:
        """
        Return the number of rows in each group.

        .. deprecated:: 0.20.5
            This method has been renamed to :func:`LazyGroupBy.len`.

        Rows containing null values count towards the total.

        Examples
        --------
        >>> lf = pl.LazyFrame(
        ...     {
        ...         "a": ["Apple", "Apple", "Orange"],
        ...         "b": [1, None, 2],
        ...     }
        ... )
        >>> lf.group_by("a").count().collect()  # doctest: +SKIP
        shape: (2, 2)
        ┌────────┬───────┐
        │ a      ┆ count │
        │ ---    ┆ ---   │
        │ str    ┆ u32   │
        ╞════════╪═══════╡
        │ Apple  ┆ 2     │
        │ Orange ┆ 1     │
        └────────┴───────┘
        """
        # Same expression as `len`, but the output column is named "count".
        counted = F.len().alias("count")
        return self.agg(counted)

    def first(self) -> LazyFrame:
        """
        Aggregate the first values in the group.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).first().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 1   ┆ 0.5  ┆ true  │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 4   ┆ 13.0 ┆ false │
        └────────┴─────┴──────┴───────┘
        """
        first_values = F.all().first()
        return self.agg(first_values)

    def last(self) -> LazyFrame:
        """
        Aggregate the last values in the group.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 14, 13],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).last().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 3   ┆ 10.0 ┆ false │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 5   ┆ 13.0 ┆ true  │
        └────────┴─────┴──────┴───────┘
        """
        last_values = F.all().last()
        return self.agg(last_values)

    def max(self) -> LazyFrame:
        """
        Reduce the groups to the maximal value.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).max().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────┬──────┐
        │ d      ┆ a   ┆ b    ┆ c    │
        │ ---    ┆ --- ┆ ---  ┆ ---  │
        │ str    ┆ i64 ┆ f64  ┆ bool │
        ╞════════╪═════╪══════╪══════╡
        │ Apple  ┆ 3   ┆ 10.0 ┆ true │
        │ Orange ┆ 2   ┆ 0.5  ┆ true │
        │ Banana ┆ 5   ┆ 14.0 ┆ true │
        └────────┴─────┴──────┴──────┘
        """
        maxima = F.all().max()
        return self.agg(maxima)

    def mean(self) -> LazyFrame:
        """
        Reduce the groups to the mean values.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).mean().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────────┬──────────┐
        │ d      ┆ a   ┆ b        ┆ c        │
        │ ---    ┆ --- ┆ ---      ┆ ---      │
        │ str    ┆ f64 ┆ f64      ┆ f64      │
        ╞════════╪═════╪══════════╪══════════╡
        │ Apple  ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
        │ Orange ┆ 2.0 ┆ 0.5      ┆ 1.0      │
        │ Banana ┆ 4.5 ┆ 13.5     ┆ 0.5      │
        └────────┴─────┴──────────┴──────────┘
        """
        means = F.all().mean()
        return self.agg(means)

    def median(self) -> LazyFrame:
        """
        Return the median per group.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).median().collect()
        shape: (2, 3)
        ┌────────┬─────┬──────┐
        │ d      ┆ a   ┆ b    │
        │ ---    ┆ --- ┆ ---  │
        │ str    ┆ f64 ┆ f64  │
        ╞════════╪═════╪══════╡
        │ Apple  ┆ 2.0 ┆ 4.0  │
        │ Banana ┆ 4.0 ┆ 13.0 │
        └────────┴─────┴──────┘
        """
        medians = F.all().median()
        return self.agg(medians)

    def min(self) -> LazyFrame:
        """
        Reduce the groups to the minimal value.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).min().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────┬───────┐
        │ d      ┆ a   ┆ b    ┆ c     │
        │ ---    ┆ --- ┆ ---  ┆ ---   │
        │ str    ┆ i64 ┆ f64  ┆ bool  │
        ╞════════╪═════╪══════╪═══════╡
        │ Apple  ┆ 1   ┆ 0.5  ┆ false │
        │ Orange ┆ 2   ┆ 0.5  ┆ true  │
        │ Banana ┆ 4   ┆ 13.0 ┆ false │
        └────────┴─────┴──────┴───────┘
        """
        minima = F.all().min()
        return self.agg(minima)

    def n_unique(self) -> LazyFrame:
        """
        Count the unique values per group.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 1, 3, 4, 5],
        ...         "b": [0.5, 0.5, 0.5, 10, 13, 14],
        ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).n_unique().collect()
        shape: (2, 3)
        ┌────────┬─────┬─────┐
        │ d      ┆ a   ┆ b   │
        │ ---    ┆ --- ┆ --- │
        │ str    ┆ u32 ┆ u32 │
        ╞════════╪═════╪═════╡
        │ Apple  ┆ 2   ┆ 2   │
        │ Banana ┆ 3   ┆ 3   │
        └────────┴─────┴─────┘
        """
        distinct_counts = F.all().n_unique()
        return self.agg(distinct_counts)

    def quantile(
        self, quantile: float, interpolation: QuantileMethod = "nearest"
    ) -> LazyFrame:
        """
        Compute the quantile per group.

        Parameters
        ----------
        quantile
            Quantile between 0.0 and 1.0.
        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
            Interpolation method.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).quantile(1).collect()
        shape: (3, 3)
        ┌────────┬─────┬──────┐
        │ d      ┆ a   ┆ b    │
        │ ---    ┆ --- ┆ ---  │
        │ str    ┆ f64 ┆ f64  │
        ╞════════╪═════╪══════╡
        │ Apple  ┆ 3.0 ┆ 10.0 │
        │ Orange ┆ 2.0 ┆ 0.5  │
        │ Banana ┆ 5.0 ┆ 14.0 │
        └────────┴─────┴──────┘
        """  # noqa: W505
        quantiles = F.all().quantile(quantile, interpolation=interpolation)
        return self.agg(quantiles)

    def sum(self) -> LazyFrame:
        """
        Reduce the groups to the sum.

        Examples
        --------
        >>> ldf = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 4, 10, 13, 14],
        ...         "c": [True, True, True, False, False, True],
        ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
        ...     }
        ... ).lazy()
        >>> ldf.group_by("d", maintain_order=True).sum().collect()
        shape: (3, 4)
        ┌────────┬─────┬──────┬─────┐
        │ d      ┆ a   ┆ b    ┆ c   │
        │ ---    ┆ --- ┆ ---  ┆ --- │
        │ str    ┆ i64 ┆ f64  ┆ u32 │
        ╞════════╪═════╪══════╪═════╡
        │ Apple  ┆ 6   ┆ 14.5 ┆ 2   │
        │ Orange ┆ 2   ┆ 0.5  ┆ 1   │
        │ Banana ┆ 9   ┆ 27.0 ┆ 1   │
        └────────┴─────┴──────┴─────┘
        """
        totals = F.all().sum()
        return self.agg(totals)