670 lines
25 KiB
Python
670 lines
25 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, Callable
|
|
|
|
from polars import functions as F
|
|
from polars._utils.deprecation import deprecated
|
|
from polars._utils.parse import parse_into_list_of_expressions
|
|
from polars._utils.wrap import wrap_df, wrap_ldf
|
|
|
|
if TYPE_CHECKING:
|
|
import sys
|
|
from collections.abc import Iterable
|
|
|
|
from polars import DataFrame, LazyFrame
|
|
from polars._plr import PyLazyGroupBy
|
|
from polars._typing import IntoExpr, QuantileMethod, SchemaDict
|
|
|
|
if sys.version_info >= (3, 13):
|
|
from warnings import deprecated
|
|
else:
|
|
from typing_extensions import deprecated # noqa: TC004
|
|
|
|
|
|
class LazyGroupBy:
|
|
"""
|
|
Utility class for performing a group by operation over a lazy DataFrame.
|
|
|
|
Generated by calling `df.lazy().group_by(...)`.
|
|
"""
|
|
|
|
def __init__(self, lgb: PyLazyGroupBy) -> None:
|
|
self.lgb = lgb
|
|
|
|
def agg(
    self,
    *aggs: IntoExpr | Iterable[IntoExpr],
    **named_aggs: IntoExpr,
) -> LazyFrame:
    """
    Run one or more aggregations over every group of this group by operation.

    Parameters
    ----------
    *aggs
        Aggregations to compute for each group, given as positional arguments.
        Expression input is accepted; strings are parsed as column names.
    **named_aggs
        Further aggregations, given as keyword arguments.
        Each resulting column is renamed to the keyword it was passed under.

    Raises
    ------
    TypeError
        If the first positional argument is a dictionary.

    Examples
    --------
    Compute the aggregation of the columns for each group.

    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": ["a", "b", "a", "b", "c"],
    ...         "b": [1, 2, 1, 3, 3],
    ...         "c": [5, 4, 3, 2, 1],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("a").agg(
    ...     [pl.col("b"), pl.col("c")]
    ... ).collect()  # doctest: +IGNORE_RESULT
    shape: (3, 3)
    ┌─────┬───────────┬───────────┐
    │ a   ┆ b         ┆ c         │
    │ --- ┆ ---       ┆ ---       │
    │ str ┆ list[i64] ┆ list[i64] │
    ╞═════╪═══════════╪═══════════╡
    │ a   ┆ [1, 1]    ┆ [5, 3]    │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
    │ b   ┆ [2, 3]    ┆ [4, 2]    │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
    │ c   ┆ [3]       ┆ [1]       │
    └─────┴───────────┴───────────┘

    Compute the sum of a column for each group.

    >>> ldf.group_by("a").agg(
    ...     pl.col("b").sum()
    ... ).collect()  # doctest: +IGNORE_RESULT
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ str ┆ i64 │
    ╞═════╪═════╡
    │ a   ┆ 2   │
    │ b   ┆ 5   │
    │ c   ┆ 3   │
    └─────┴─────┘

    Compute multiple aggregates at once by passing a list of expressions.

    >>> ldf.group_by("a").agg(
    ...     [pl.sum("b"), pl.mean("c")]
    ... ).collect()  # doctest: +IGNORE_RESULT
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ str ┆ i64 ┆ f64 │
    ╞═════╪═════╪═════╡
    │ c   ┆ 3   ┆ 1.0 │
    │ a   ┆ 2   ┆ 4.0 │
    │ b   ┆ 5   ┆ 3.0 │
    └─────┴─────┴─────┘

    Or use positional arguments to compute multiple aggregations in the same way.

    >>> ldf.group_by("a").agg(
    ...     pl.sum("b").name.suffix("_sum"),
    ...     (pl.col("c") ** 2).mean().name.suffix("_mean_squared"),
    ... ).collect()  # doctest: +IGNORE_RESULT
    shape: (3, 3)
    ┌─────┬───────┬────────────────┐
    │ a   ┆ b_sum ┆ c_mean_squared │
    │ --- ┆ ---   ┆ ---            │
    │ str ┆ i64   ┆ f64            │
    ╞═════╪═══════╪════════════════╡
    │ a   ┆ 2     ┆ 17.0           │
    │ c   ┆ 3     ┆ 1.0            │
    │ b   ┆ 5     ┆ 10.0           │
    └─────┴───────┴────────────────┘

    Use keyword arguments to easily name your expression inputs.

    >>> ldf.group_by("a").agg(
    ...     b_sum=pl.sum("b"),
    ...     c_mean_squared=(pl.col("c") ** 2).mean(),
    ... ).collect()  # doctest: +IGNORE_RESULT
    shape: (3, 3)
    ┌─────┬───────┬────────────────┐
    │ a   ┆ b_sum ┆ c_mean_squared │
    │ --- ┆ ---   ┆ ---            │
    │ str ┆ i64   ┆ f64            │
    ╞═════╪═══════╪════════════════╡
    │ a   ┆ 2     ┆ 17.0           │
    │ c   ┆ 3     ┆ 1.0            │
    │ b   ┆ 5     ┆ 10.0           │
    └─────┴───────┴────────────────┘
    """
    # Guard clause: a dict in the first positional slot is rejected up front
    # with a hint to unpack it into keyword arguments instead.
    first_is_dict = bool(aggs) and isinstance(aggs[0], dict)
    if first_is_dict:
        raise TypeError(
            "specifying aggregations as a dictionary is not supported"
            "\n\nTry unpacking the dictionary to take advantage of the keyword syntax"
            " of the `agg` method."
        )

    parsed_exprs = parse_into_list_of_expressions(*aggs, **named_aggs)
    aggregated = self.lgb.agg(parsed_exprs)
    return wrap_ldf(aggregated)
|
|
|
|
def map_groups(
    self,
    function: Callable[[DataFrame], DataFrame],
    schema: SchemaDict | None,
) -> LazyFrame:
    """
    Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.

    .. warning::
        This method is much slower than the native expressions API.
        Only use it if you cannot implement your logic otherwise.

    Using this is considered an anti-pattern as it will be very slow because:

    - it forces the engine to materialize the whole `DataFrames` for the groups.
    - it is not parallelized
    - it blocks optimizations as the passed python function is opaque to the
      optimizer

    The idiomatic way to apply custom functions over multiple columns is to
    pack them into a struct and map over that struct column:

    `pl.struct([my_columns]).map_batches(lambda struct_series: ..)`

    (or `map_elements` when the logic is row-wise).

    Parameters
    ----------
    function
        Function to apply over each group of the `LazyFrame`. It receives the
        group as a `DataFrame` and must return a `DataFrame`.
    schema
        Schema of the output function. This has to be known statically. If the
        given schema is incorrect, this is a bug in the caller's query and may
        lead to errors. If set to None, polars assumes the schema is unchanged.

    Examples
    --------
    For each color group sample two rows:

    >>> df = pl.DataFrame(
    ...     {
    ...         "id": [0, 1, 2, 3, 4],
    ...         "color": ["red", "green", "green", "red", "red"],
    ...         "shape": ["square", "triangle", "square", "triangle", "square"],
    ...     }
    ... )
    >>> (
    ...     df.lazy()
    ...     .group_by("color")
    ...     .map_groups(lambda group_df: group_df.sample(2), schema=None)
    ...     .collect()
    ... )  # doctest: +IGNORE_RESULT
    shape: (4, 3)
    ┌─────┬───────┬──────────┐
    │ id  ┆ color ┆ shape    │
    │ --- ┆ ---   ┆ ---      │
    │ i64 ┆ str   ┆ str      │
    ╞═════╪═══════╪══════════╡
    │ 1   ┆ green ┆ triangle │
    │ 2   ┆ green ┆ square   │
    │ 4   ┆ red   ┆ square   │
    │ 3   ┆ red   ┆ triangle │
    └─────┴───────┴──────────┘

    It is better to implement this with an expression:

    >>> df.lazy().filter(
    ...     pl.int_range(pl.len()).shuffle().over("color") < 2
    ... ).collect()  # doctest: +IGNORE_RESULT
    """

    def _call_on_group(raw_group):  # noqa: ANN001, ANN202
        # Wrap the object handed over by `self.lgb` so the user function sees
        # a `DataFrame`, then hand back the result's `_df` attribute.
        group_frame = wrap_df(raw_group)
        return function(group_frame)._df

    mapped = self.lgb.map_groups(_call_on_group, schema)
    return wrap_ldf(mapped)
|
|
|
|
def head(self, n: int = 5) -> LazyFrame:
    """
    Get the first `n` rows of each group.

    Parameters
    ----------
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "letters": ["c", "c", "a", "c", "a", "b"],
    ...         "nrs": [1, 2, 3, 4, 5, 6],
    ...     }
    ... )
    >>> df
    shape: (6, 2)
    ┌─────────┬─────┐
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ c       ┆ 1   │
    │ c       ┆ 2   │
    │ a       ┆ 3   │
    │ c       ┆ 4   │
    │ a       ┆ 5   │
    │ b       ┆ 6   │
    └─────────┴─────┘

    Go through the lazy API so that this method (rather than the eager
    `GroupBy.head`) is the one being exercised:

    >>> df.lazy().group_by("letters").head(2).collect().sort("letters")
    shape: (5, 2)
    ┌─────────┬─────┐
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ a       ┆ 3   │
    │ a       ┆ 5   │
    │ b       ┆ 6   │
    │ c       ┆ 1   │
    │ c       ┆ 2   │
    └─────────┴─────┘
    """
    leading_rows = self.lgb.head(n)
    return wrap_ldf(leading_rows)
|
|
|
|
def tail(self, n: int = 5) -> LazyFrame:
    """
    Get the last `n` rows of each group.

    Parameters
    ----------
    n
        Number of rows to return.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "letters": ["c", "c", "a", "c", "a", "b"],
    ...         "nrs": [1, 2, 3, 4, 5, 6],
    ...     }
    ... )
    >>> df
    shape: (6, 2)
    ┌─────────┬─────┐
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ c       ┆ 1   │
    │ c       ┆ 2   │
    │ a       ┆ 3   │
    │ c       ┆ 4   │
    │ a       ┆ 5   │
    │ b       ┆ 6   │
    └─────────┴─────┘

    Go through the lazy API so that this method (rather than the eager
    `GroupBy.tail`) is the one being exercised:

    >>> df.lazy().group_by("letters").tail(2).collect().sort("letters")
    shape: (5, 2)
    ┌─────────┬─────┐
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ a       ┆ 3   │
    │ a       ┆ 5   │
    │ b       ┆ 6   │
    │ c       ┆ 2   │
    │ c       ┆ 4   │
    └─────────┴─────┘
    """
    trailing_rows = self.lgb.tail(n)
    return wrap_ldf(trailing_rows)
|
|
|
|
def all(self) -> LazyFrame:
    """
    Gather the values of each group into one list per non-key column.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": ["one", "two", "one", "two"],
    ...         "b": [1, 2, 3, 4],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("a", maintain_order=True).all().collect()
    shape: (2, 2)
    ┌─────┬───────────┐
    │ a   ┆ b         │
    │ --- ┆ ---       │
    │ str ┆ list[i64] │
    ╞═════╪═══════════╡
    │ one ┆ [1, 3]    │
    │ two ┆ [2, 4]    │
    └─────┴───────────┘
    """
    every_column = F.all()
    return self.agg(every_column)
|
|
|
|
def len(self, name: str | None = None) -> LazyFrame:
    """
    Count the rows that fall into each group.

    Parameters
    ----------
    name
        Name for the resulting column; when left as None the column is
        called "len".

    Examples
    --------
    >>> lf = pl.LazyFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
    >>> lf.group_by("a").len().collect()  # doctest: +IGNORE_RESULT
    shape: (2, 2)
    ┌────────┬─────┐
    │ a      ┆ len │
    │ ---    ┆ --- │
    │ str    ┆ u32 │
    ╞════════╪═════╡
    │ Apple  ┆ 2   │
    │ Orange ┆ 1   │
    └────────┴─────┘
    >>> lf.group_by("a").len(name="n").collect()  # doctest: +IGNORE_RESULT
    shape: (2, 2)
    ┌────────┬─────┐
    │ a      ┆ n   │
    │ ---    ┆ --- │
    │ str    ┆ u32 │
    ╞════════╪═════╡
    │ Apple  ┆ 2   │
    │ Orange ┆ 1   │
    └────────┴─────┘
    """
    row_count = F.len()
    # Only alias when the caller asked for a specific column name.
    return self.agg(row_count if name is None else row_count.alias(name))
|
|
|
|
@deprecated("`count` was renamed; use `len` instead")
def count(self) -> LazyFrame:
    """
    Count the rows that fall into each group.

    .. deprecated:: 0.20.5
        This method has been renamed to :func:`LazyGroupBy.len`.

    Rows containing null values count towards the total.

    Examples
    --------
    >>> lf = pl.LazyFrame(
    ...     {
    ...         "a": ["Apple", "Apple", "Orange"],
    ...         "b": [1, None, 2],
    ...     }
    ... )
    >>> lf.group_by("a").count().collect()  # doctest: +SKIP
    shape: (2, 2)
    ┌────────┬───────┐
    │ a      ┆ count │
    │ ---    ┆ ---   │
    │ str    ┆ u32   │
    ╞════════╪═══════╡
    │ Apple  ┆ 2     │
    │ Orange ┆ 1     │
    └────────┴───────┘
    """
    # Same expression as `len`, but the output column keeps the old "count" name.
    rows_per_group = F.len().alias("count")
    return self.agg(rows_per_group)
|
|
|
|
def first(self) -> LazyFrame:
    """
    Take the first value of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).first().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────┬───────┐
    │ d      ┆ a   ┆ b    ┆ c     │
    │ ---    ┆ --- ┆ ---  ┆ ---   │
    │ str    ┆ i64 ┆ f64  ┆ bool  │
    ╞════════╪═════╪══════╪═══════╡
    │ Apple  ┆ 1   ┆ 0.5  ┆ true  │
    │ Orange ┆ 2   ┆ 0.5  ┆ true  │
    │ Banana ┆ 4   ┆ 13.0 ┆ false │
    └────────┴─────┴──────┴───────┘
    """
    first_of_each = F.all().first()
    return self.agg(first_of_each)
|
|
|
|
def last(self) -> LazyFrame:
    """
    Take the last value of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 14, 13],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).last().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────┬───────┐
    │ d      ┆ a   ┆ b    ┆ c     │
    │ ---    ┆ --- ┆ ---  ┆ ---   │
    │ str    ┆ i64 ┆ f64  ┆ bool  │
    ╞════════╪═════╪══════╪═══════╡
    │ Apple  ┆ 3   ┆ 10.0 ┆ false │
    │ Orange ┆ 2   ┆ 0.5  ┆ true  │
    │ Banana ┆ 5   ┆ 13.0 ┆ true  │
    └────────┴─────┴──────┴───────┘
    """
    last_of_each = F.all().last()
    return self.agg(last_of_each)
|
|
|
|
def max(self) -> LazyFrame:
    """
    Compute the maximum of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).max().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────┬──────┐
    │ d      ┆ a   ┆ b    ┆ c    │
    │ ---    ┆ --- ┆ ---  ┆ ---  │
    │ str    ┆ i64 ┆ f64  ┆ bool │
    ╞════════╪═════╪══════╪══════╡
    │ Apple  ┆ 3   ┆ 10.0 ┆ true │
    │ Orange ┆ 2   ┆ 0.5  ┆ true │
    │ Banana ┆ 5   ┆ 14.0 ┆ true │
    └────────┴─────┴──────┴──────┘
    """
    largest_of_each = F.all().max()
    return self.agg(largest_of_each)
|
|
|
|
def mean(self) -> LazyFrame:
    """
    Compute the mean of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).mean().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────────┬──────────┐
    │ d      ┆ a   ┆ b        ┆ c        │
    │ ---    ┆ --- ┆ ---      ┆ ---      │
    │ str    ┆ f64 ┆ f64      ┆ f64      │
    ╞════════╪═════╪══════════╪══════════╡
    │ Apple  ┆ 2.0 ┆ 4.833333 ┆ 0.666667 │
    │ Orange ┆ 2.0 ┆ 0.5      ┆ 1.0      │
    │ Banana ┆ 4.5 ┆ 13.5     ┆ 0.5      │
    └────────┴─────┴──────────┴──────────┘
    """
    average_of_each = F.all().mean()
    return self.agg(average_of_each)
|
|
|
|
def median(self) -> LazyFrame:
    """
    Compute the median of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).median().collect()
    shape: (2, 3)
    ┌────────┬─────┬──────┐
    │ d      ┆ a   ┆ b    │
    │ ---    ┆ --- ┆ ---  │
    │ str    ┆ f64 ┆ f64  │
    ╞════════╪═════╪══════╡
    │ Apple  ┆ 2.0 ┆ 4.0  │
    │ Banana ┆ 4.0 ┆ 13.0 │
    └────────┴─────┴──────┘
    """
    median_of_each = F.all().median()
    return self.agg(median_of_each)
|
|
|
|
def min(self) -> LazyFrame:
    """
    Compute the minimum of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).min().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────┬───────┐
    │ d      ┆ a   ┆ b    ┆ c     │
    │ ---    ┆ --- ┆ ---  ┆ ---   │
    │ str    ┆ i64 ┆ f64  ┆ bool  │
    ╞════════╪═════╪══════╪═══════╡
    │ Apple  ┆ 1   ┆ 0.5  ┆ false │
    │ Orange ┆ 2   ┆ 0.5  ┆ true  │
    │ Banana ┆ 4   ┆ 13.0 ┆ false │
    └────────┴─────┴──────┴───────┘
    """
    smallest_of_each = F.all().min()
    return self.agg(smallest_of_each)
|
|
|
|
def n_unique(self) -> LazyFrame:
    """
    Count the distinct values of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 1, 3, 4, 5],
    ...         "b": [0.5, 0.5, 0.5, 10, 13, 14],
    ...         "d": ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).n_unique().collect()
    shape: (2, 3)
    ┌────────┬─────┬─────┐
    │ d      ┆ a   ┆ b   │
    │ ---    ┆ --- ┆ --- │
    │ str    ┆ u32 ┆ u32 │
    ╞════════╪═════╪═════╡
    │ Apple  ┆ 2   ┆ 2   │
    │ Banana ┆ 3   ┆ 3   │
    └────────┴─────┴─────┘
    """
    distinct_of_each = F.all().n_unique()
    return self.agg(distinct_of_each)
|
|
|
|
def quantile(
    self, quantile: float, interpolation: QuantileMethod = "nearest"
) -> LazyFrame:
    """
    Compute the requested quantile of each non-key column within every group.

    Parameters
    ----------
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
        Interpolation method.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).quantile(1).collect()
    shape: (3, 3)
    ┌────────┬─────┬──────┐
    │ d      ┆ a   ┆ b    │
    │ ---    ┆ --- ┆ ---  │
    │ str    ┆ f64 ┆ f64  │
    ╞════════╪═════╪══════╡
    │ Apple  ┆ 3.0 ┆ 10.0 │
    │ Orange ┆ 2.0 ┆ 0.5  │
    │ Banana ┆ 5.0 ┆ 14.0 │
    └────────┴─────┴──────┘
    """  # noqa: W505
    quantile_of_each = F.all().quantile(quantile, interpolation=interpolation)
    return self.agg(quantile_of_each)
|
|
|
|
def sum(self) -> LazyFrame:
    """
    Compute the sum of each non-key column within every group.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 2, 3, 4, 5],
    ...         "b": [0.5, 0.5, 4, 10, 13, 14],
    ...         "c": [True, True, True, False, False, True],
    ...         "d": ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
    ...     }
    ... ).lazy()
    >>> ldf.group_by("d", maintain_order=True).sum().collect()
    shape: (3, 4)
    ┌────────┬─────┬──────┬─────┐
    │ d      ┆ a   ┆ b    ┆ c   │
    │ ---    ┆ --- ┆ ---  ┆ --- │
    │ str    ┆ i64 ┆ f64  ┆ u32 │
    ╞════════╪═════╪══════╪═════╡
    │ Apple  ┆ 6   ┆ 14.5 ┆ 2   │
    │ Orange ┆ 2   ┆ 0.5  ┆ 1   │
    │ Banana ┆ 9   ┆ 27.0 ┆ 1   │
    └────────┴─────┴──────┴─────┘
    """
    total_of_each = F.all().sum()
    return self.agg(total_of_each)
|