1506 lines
58 KiB
Python
1506 lines
58 KiB
Python
from __future__ import annotations
|
|
|
|
import copy
|
|
from collections.abc import Collection, Sequence
|
|
from typing import TYPE_CHECKING, Any, Callable
|
|
|
|
import polars._reexport as pl
|
|
from polars import exceptions
|
|
from polars import functions as F
|
|
from polars._utils.parse import parse_into_expression
|
|
from polars._utils.unstable import unstable
|
|
from polars._utils.various import issue_warning
|
|
from polars._utils.wrap import wrap_expr
|
|
|
|
if TYPE_CHECKING:
|
|
from polars import Expr, Series
|
|
from polars._typing import (
|
|
IntoExpr,
|
|
IntoExprColumn,
|
|
ListToStructWidthStrategy,
|
|
NullBehavior,
|
|
)
|
|
|
|
|
|
class ExprListNameSpace:
|
|
"""Namespace for list related expressions."""
|
|
|
|
_accessor = "list"
|
|
|
|
def __init__(self, expr: Expr) -> None:
    """Bind this namespace to the low-level expression handle of `expr`."""
    # Only the underlying `_pyexpr` handle is stored, not the `Expr` wrapper.
    pyexpr = expr._pyexpr
    self._pyexpr = pyexpr
|
|
|
|
def __getitem__(self, item: int) -> Expr:
    """Allow `expr.list[item]` as shorthand for `expr.list.get(item)`."""
    return self.get(index=item)
|
|
|
|
def all(self) -> Expr:
    """
    Check, per list, whether every boolean value is true.

    Notes
    -----
    If there are no non-null elements in a row, the output is `True`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"a": [[True, True], [False, True], [False, False], [None], [], None]}
    ... )
    >>> df.with_columns(all=pl.col("a").list.all())
    shape: (6, 2)
    ┌────────────────┬───────┐
    │ a              ┆ all   │
    │ ---            ┆ ---   │
    │ list[bool]     ┆ bool  │
    ╞════════════════╪═══════╡
    │ [true, true]   ┆ true  │
    │ [false, true]  ┆ false │
    │ [false, false] ┆ false │
    │ [null]         ┆ true  │
    │ []             ┆ true  │
    │ null           ┆ null  │
    └────────────────┴───────┘
    """
    all_true = self._pyexpr.list_all()
    return wrap_expr(all_true)
|
|
|
|
def any(self) -> Expr:
    """
    Check, per list, whether at least one boolean value is true.

    Notes
    -----
    If there are no non-null elements in a row, the output is `False`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"a": [[True, True], [False, True], [False, False], [None], [], None]}
    ... )
    >>> df.with_columns(any=pl.col("a").list.any())
    shape: (6, 2)
    ┌────────────────┬───────┐
    │ a              ┆ any   │
    │ ---            ┆ ---   │
    │ list[bool]     ┆ bool  │
    ╞════════════════╪═══════╡
    │ [true, true]   ┆ true  │
    │ [false, true]  ┆ true  │
    │ [false, false] ┆ false │
    │ [null]         ┆ false │
    │ []             ┆ false │
    │ null           ┆ null  │
    └────────────────┴───────┘
    """
    any_true = self._pyexpr.list_any()
    return wrap_expr(any_true)
|
|
|
|
def len(self) -> Expr:
    """
    Return the number of elements in each list.

    Null values are counted as elements.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, 2, None], [5]]})
    >>> df.with_columns(len=pl.col("a").list.len())
    shape: (2, 2)
    ┌──────────────┬─────┐
    │ a            ┆ len │
    │ ---          ┆ --- │
    │ list[i64]    ┆ u32 │
    ╞══════════════╪═════╡
    │ [1, 2, null] ┆ 3   │
    │ [5]          ┆ 1   │
    └──────────────┴─────┘
    """
    lengths = self._pyexpr.list_len()
    return wrap_expr(lengths)
|
|
|
|
def drop_nulls(self) -> Expr:
    """
    Remove all null values from each list.

    The remaining elements keep their original order.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[None, 1, None, 2], [None], [3, 4]]})
    >>> df.with_columns(drop_nulls=pl.col("values").list.drop_nulls())
    shape: (3, 2)
    ┌────────────────┬────────────┐
    │ values         ┆ drop_nulls │
    │ ---            ┆ ---        │
    │ list[i64]      ┆ list[i64]  │
    ╞════════════════╪════════════╡
    │ [null, 1, … 2] ┆ [1, 2]     │
    │ [null]         ┆ []         │
    │ [3, 4]         ┆ [3, 4]     │
    └────────────────┴────────────┘
    """
    without_nulls = self._pyexpr.list_drop_nulls()
    return wrap_expr(without_nulls)
|
|
|
|
def sample(
    self,
    n: int | IntoExprColumn | None = None,
    *,
    fraction: float | IntoExprColumn | None = None,
    with_replacement: bool = False,
    shuffle: bool = False,
    seed: int | None = None,
) -> Expr:
    """
    Draw a sample from each list.

    Parameters
    ----------
    n
        Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
        `fraction` is None.
    fraction
        Fraction of items to return. Cannot be used with `n`.
    with_replacement
        Allow values to be sampled more than once.
    shuffle
        Shuffle the order of sampled data points.
    seed
        Seed for the random number generator. If set to None (default), a
        random seed is generated for each sample operation.

    Raises
    ------
    ValueError
        If both `n` and `fraction` are given.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[1, 2, 3], [4, 5]], "n": [2, 1]})
    >>> df.with_columns(sample=pl.col("values").list.sample(n=pl.col("n"), seed=1))
    shape: (2, 3)
    ┌───────────┬─────┬───────────┐
    │ values    ┆ n   ┆ sample    │
    │ ---       ┆ --- ┆ ---       │
    │ list[i64] ┆ i64 ┆ list[i64] │
    ╞═══════════╪═════╪═══════════╡
    │ [1, 2, 3] ┆ 2   ┆ [2, 3]    │
    │ [4, 5]    ┆ 1   ┆ [5]       │
    └───────────┴─────┴───────────┘
    """
    if fraction is None:
        # Count-based sampling; a missing `n` means "one item per list".
        count_pyexpr = parse_into_expression(1 if n is None else n)
        sampled = self._pyexpr.list_sample_n(
            count_pyexpr, with_replacement, shuffle, seed
        )
        return wrap_expr(sampled)

    if n is not None:
        msg = "cannot specify both `n` and `fraction`"
        raise ValueError(msg)

    share_pyexpr = parse_into_expression(fraction)
    sampled = self._pyexpr.list_sample_fraction(
        share_pyexpr, with_replacement, shuffle, seed
    )
    return wrap_expr(sampled)
|
|
|
|
def sum(self) -> Expr:
    """
    Compute the sum of the values in each list.

    Notes
    -----
    If there are no non-null elements in a row, the output is `0`.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[1], [2, 3]]})
    >>> df.with_columns(sum=pl.col("values").list.sum())
    shape: (2, 2)
    ┌───────────┬─────┐
    │ values    ┆ sum │
    │ ---       ┆ --- │
    │ list[i64] ┆ i64 │
    ╞═══════════╪═════╡
    │ [1]       ┆ 1   │
    │ [2, 3]    ┆ 5   │
    └───────────┴─────┘
    """
    totals = self._pyexpr.list_sum()
    return wrap_expr(totals)
|
|
|
|
def max(self) -> Expr:
    """
    Compute the maximum value of each list.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[1], [2, 3]]})
    >>> df.with_columns(max=pl.col("values").list.max())
    shape: (2, 2)
    ┌───────────┬─────┐
    │ values    ┆ max │
    │ ---       ┆ --- │
    │ list[i64] ┆ i64 │
    ╞═══════════╪═════╡
    │ [1]       ┆ 1   │
    │ [2, 3]    ┆ 3   │
    └───────────┴─────┘
    """
    maxima = self._pyexpr.list_max()
    return wrap_expr(maxima)
|
|
|
|
def min(self) -> Expr:
    """
    Compute the minimum value of each list.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[1], [2, 3]]})
    >>> df.with_columns(min=pl.col("values").list.min())
    shape: (2, 2)
    ┌───────────┬─────┐
    │ values    ┆ min │
    │ ---       ┆ --- │
    │ list[i64] ┆ i64 │
    ╞═══════════╪═════╡
    │ [1]       ┆ 1   │
    │ [2, 3]    ┆ 2   │
    └───────────┴─────┘
    """
    minima = self._pyexpr.list_min()
    return wrap_expr(minima)
|
|
|
|
def mean(self) -> Expr:
    """
    Compute the mean value of each list.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[1], [2, 3]]})
    >>> df.with_columns(mean=pl.col("values").list.mean())
    shape: (2, 2)
    ┌───────────┬──────┐
    │ values    ┆ mean │
    │ ---       ┆ ---  │
    │ list[i64] ┆ f64  │
    ╞═══════════╪══════╡
    │ [1]       ┆ 1.0  │
    │ [2, 3]    ┆ 2.5  │
    └───────────┴──────┘
    """
    means = self._pyexpr.list_mean()
    return wrap_expr(means)
|
|
|
|
def median(self) -> Expr:
    """
    Compute the median value of each list.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[-1, 0, 1], [1, 10]]})
    >>> df.with_columns(pl.col("values").list.median().alias("median"))
    shape: (2, 2)
    ┌────────────┬────────┐
    │ values     ┆ median │
    │ ---        ┆ ---    │
    │ list[i64]  ┆ f64    │
    ╞════════════╪════════╡
    │ [-1, 0, 1] ┆ 0.0    │
    │ [1, 10]    ┆ 5.5    │
    └────────────┴────────┘
    """
    medians = self._pyexpr.list_median()
    return wrap_expr(medians)
|
|
|
|
def std(self, ddof: int = 1) -> Expr:
    """
    Compute the standard deviation of each list.

    Parameters
    ----------
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[-1, 0, 1], [1, 10]]})
    >>> df.with_columns(pl.col("values").list.std().alias("std"))
    shape: (2, 2)
    ┌────────────┬──────────┐
    │ values     ┆ std      │
    │ ---        ┆ ---      │
    │ list[i64]  ┆ f64      │
    ╞════════════╪══════════╡
    │ [-1, 0, 1] ┆ 1.0      │
    │ [1, 10]    ┆ 6.363961 │
    └────────────┴──────────┘
    """
    deviations = self._pyexpr.list_std(ddof)
    return wrap_expr(deviations)
|
|
|
|
def var(self, ddof: int = 1) -> Expr:
    """
    Compute the variance of each list.

    Parameters
    ----------
    ddof
        “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
        where N represents the number of elements.
        By default ddof is 1.

    Examples
    --------
    >>> df = pl.DataFrame({"values": [[-1, 0, 1], [1, 10]]})
    >>> df.with_columns(pl.col("values").list.var().alias("var"))
    shape: (2, 2)
    ┌────────────┬──────┐
    │ values     ┆ var  │
    │ ---        ┆ ---  │
    │ list[i64]  ┆ f64  │
    ╞════════════╪══════╡
    │ [-1, 0, 1] ┆ 1.0  │
    │ [1, 10]    ┆ 40.5 │
    └────────────┴──────┘
    """
    variances = self._pyexpr.list_var(ddof)
    return wrap_expr(variances)
|
|
|
|
def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr:
    """
    Sort the values inside each list.

    Parameters
    ----------
    descending
        Sort in descending order.
    nulls_last
        Place null values last.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[3, 2, 1], [9, 1, 2]],
    ...     }
    ... )
    >>> df.with_columns(sort=pl.col("a").list.sort())
    shape: (2, 2)
    ┌───────────┬───────────┐
    │ a         ┆ sort      │
    │ ---       ┆ ---       │
    │ list[i64] ┆ list[i64] │
    ╞═══════════╪═══════════╡
    │ [3, 2, 1] ┆ [1, 2, 3] │
    │ [9, 1, 2] ┆ [1, 2, 9] │
    └───────────┴───────────┘
    >>> df.with_columns(sort=pl.col("a").list.sort(descending=True))
    shape: (2, 2)
    ┌───────────┬───────────┐
    │ a         ┆ sort      │
    │ ---       ┆ ---       │
    │ list[i64] ┆ list[i64] │
    ╞═══════════╪═══════════╡
    │ [3, 2, 1] ┆ [3, 2, 1] │
    │ [9, 1, 2] ┆ [9, 2, 1] │
    └───────────┴───────────┘
    """
    ordered = self._pyexpr.list_sort(descending, nulls_last)
    return wrap_expr(ordered)
|
|
|
|
def reverse(self) -> Expr:
    """
    Reverse the order of the values in each list.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[3, 2, 1], [9, 1, 2]],
    ...     }
    ... )
    >>> df.with_columns(reverse=pl.col("a").list.reverse())
    shape: (2, 2)
    ┌───────────┬───────────┐
    │ a         ┆ reverse   │
    │ ---       ┆ ---       │
    │ list[i64] ┆ list[i64] │
    ╞═══════════╪═══════════╡
    │ [3, 2, 1] ┆ [1, 2, 3] │
    │ [9, 1, 2] ┆ [2, 1, 9] │
    └───────────┴───────────┘
    """
    flipped = self._pyexpr.list_reverse()
    return wrap_expr(flipped)
|
|
|
|
def unique(self, *, maintain_order: bool = False) -> Expr:
    """
    Get the unique/distinct values of each list.

    Parameters
    ----------
    maintain_order
        Maintain order of data. This requires more work.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 1, 2]],
    ...     }
    ... )
    >>> df.with_columns(unique=pl.col("a").list.unique())
    shape: (1, 2)
    ┌───────────┬───────────┐
    │ a         ┆ unique    │
    │ ---       ┆ ---       │
    │ list[i64] ┆ list[i64] │
    ╞═══════════╪═══════════╡
    │ [1, 1, 2] ┆ [1, 2]    │
    └───────────┴───────────┘
    """
    distinct = self._pyexpr.list_unique(maintain_order)
    return wrap_expr(distinct)
|
|
|
|
def n_unique(self) -> Expr:
    """
    Count the number of unique values in every sublist.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 1, 2], [2, 3, 4]],
    ...     }
    ... )
    >>> df.with_columns(n_unique=pl.col("a").list.n_unique())
    shape: (2, 2)
    ┌───────────┬──────────┐
    │ a         ┆ n_unique │
    │ ---       ┆ ---      │
    │ list[i64] ┆ u32      │
    ╞═══════════╪══════════╡
    │ [1, 1, 2] ┆ 2        │
    │ [2, 3, 4] ┆ 3        │
    └───────────┴──────────┘
    """
    distinct_counts = self._pyexpr.list_n_unique()
    return wrap_expr(distinct_counts)
|
|
|
|
def concat(self, other: list[Expr | str] | Expr | str | Series | list[Any]) -> Expr:
    """
    Concat the arrays in a Series dtype List in linear time.

    Parameters
    ----------
    other
        Columns to concat into a List Series. A list whose first element is
        not an expression, column name or Series is treated as a single
        literal list of values. An empty list means there are no additional
        columns to concatenate.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [["a"], ["x"]],
    ...         "b": [["b", "c"], ["y", "z"]],
    ...     }
    ... )
    >>> df.with_columns(concat=pl.col("a").list.concat("b"))
    shape: (2, 3)
    ┌───────────┬────────────┬─────────────────┐
    │ a         ┆ b          ┆ concat          │
    │ ---       ┆ ---        ┆ ---             │
    │ list[str] ┆ list[str]  ┆ list[str]       │
    ╞═══════════╪════════════╪═════════════════╡
    │ ["a"]     ┆ ["b", "c"] ┆ ["a", "b", "c"] │
    │ ["x"]     ┆ ["y", "z"] ┆ ["x", "y", "z"] │
    └───────────┴────────────┴─────────────────┘
    """
    others: list[Any]
    if isinstance(other, list):
        # Check for emptiness before looking at `other[0]`; indexing an empty
        # list would raise an IndexError.
        if other and not isinstance(other[0], (pl.Expr, str, pl.Series)):
            # A plain Python list of values: wrap it as one literal list.
            return self.concat(pl.Series([other]))
        others = other
    else:
        others = [other]

    # Build a fresh list so the caller's `other` is never mutated.
    return F.concat_list([wrap_expr(self._pyexpr), *others])
|
|
|
|
def get(
    self,
    index: int | Expr | str,
    *,
    null_on_oob: bool = False,
) -> Expr:
    """
    Get the value by index in the sublists.

    Index `0` returns the first item of every sublist and index `-1` returns
    the last item of every sublist. What happens when an index is out of
    bounds is controlled by `null_on_oob`.

    Parameters
    ----------
    index
        Index to return per sublist
    null_on_oob
        Behavior if an index is out of bounds:

        * True -> set as null
        * False -> raise an error

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [], [1, 2]]})
    >>> df.with_columns(get=pl.col("a").list.get(0, null_on_oob=True))
    shape: (3, 2)
    ┌───────────┬──────┐
    │ a         ┆ get  │
    │ ---       ┆ ---  │
    │ list[i64] ┆ i64  │
    ╞═══════════╪══════╡
    │ [3, 2, 1] ┆ 3    │
    │ []        ┆ null │
    │ [1, 2]    ┆ 1    │
    └───────────┴──────┘
    """
    position = parse_into_expression(index)
    picked = self._pyexpr.list_get(position, null_on_oob)
    return wrap_expr(picked)
|
|
|
|
def gather(
    self,
    indices: Expr | Series | list[int] | list[list[int]],
    *,
    null_on_oob: bool = False,
) -> Expr:
    """
    Take sublists by multiple indices.

    The indices may be defined in a single column, or by sublists in another
    column of dtype `List`.

    Parameters
    ----------
    indices
        Indices to return per sublist
    null_on_oob
        Behavior if an index is out of bounds:
        True -> set as null
        False -> raise an error
        Note that defaulting to raising an error is much cheaper

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [], [1, 2, 3, 4, 5]]})
    >>> df.with_columns(gather=pl.col("a").list.gather([0, 4], null_on_oob=True))
    shape: (3, 2)
    ┌─────────────┬──────────────┐
    │ a           ┆ gather       │
    │ ---         ┆ ---          │
    │ list[i64]   ┆ list[i64]    │
    ╞═════════════╪══════════════╡
    │ [3, 2, 1]   ┆ [3, null]    │
    │ []          ┆ [null, null] │
    │ [1, 2, … 5] ┆ [1, 5]       │
    └─────────────┴──────────────┘
    """
    positions = parse_into_expression(indices)
    taken = self._pyexpr.list_gather(positions, null_on_oob)
    return wrap_expr(taken)
|
|
|
|
def gather_every(
    self,
    n: int | IntoExprColumn,
    offset: int | IntoExprColumn = 0,
) -> Expr:
    """
    Take every n-th value, starting from `offset`, in the sublists.

    Parameters
    ----------
    n
        Gather every n-th element.
    offset
        Starting index.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2, 3, 4, 5], [6, 7, 8], [9, 10, 11, 12]],
    ...         "n": [2, 1, 3],
    ...         "offset": [0, 1, 0],
    ...     }
    ... )
    >>> df.with_columns(
    ...     gather_every=pl.col("a").list.gather_every(
    ...         n=pl.col("n"), offset=pl.col("offset")
    ...     )
    ... )
    shape: (3, 4)
    ┌───────────────┬─────┬────────┬──────────────┐
    │ a             ┆ n   ┆ offset ┆ gather_every │
    │ ---           ┆ --- ┆ ---    ┆ ---          │
    │ list[i64]     ┆ i64 ┆ i64    ┆ list[i64]    │
    ╞═══════════════╪═════╪════════╪══════════════╡
    │ [1, 2, … 5]   ┆ 2   ┆ 0      ┆ [1, 3, 5]    │
    │ [6, 7, 8]     ┆ 1   ┆ 1      ┆ [7, 8]       │
    │ [9, 10, … 12] ┆ 3   ┆ 0      ┆ [9, 12]      │
    └───────────────┴─────┴────────┴──────────────┘
    """
    # Parse `n` before `offset`, as the original does.
    step = parse_into_expression(n)
    start = parse_into_expression(offset)
    strided = self._pyexpr.list_gather_every(step, start)
    return wrap_expr(strided)
|
|
|
|
def first(self) -> Expr:
    """
    Get the first value of the sublists.

    Delegates to `get` with index `0` and `null_on_oob=True`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [], [1, 2]]})
    >>> df.with_columns(first=pl.col("a").list.first())
    shape: (3, 2)
    ┌───────────┬───────┐
    │ a         ┆ first │
    │ ---       ┆ ---   │
    │ list[i64] ┆ i64   │
    ╞═══════════╪═══════╡
    │ [3, 2, 1] ┆ 3     │
    │ []        ┆ null  │
    │ [1, 2]    ┆ 1     │
    └───────────┴───────┘
    """
    return self.get(index=0, null_on_oob=True)
|
|
|
|
def last(self) -> Expr:
    """
    Get the last value of the sublists.

    Delegates to `get` with index `-1` and `null_on_oob=True`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [], [1, 2]]})
    >>> df.with_columns(last=pl.col("a").list.last())
    shape: (3, 2)
    ┌───────────┬──────┐
    │ a         ┆ last │
    │ ---       ┆ ---  │
    │ list[i64] ┆ i64  │
    ╞═══════════╪══════╡
    │ [3, 2, 1] ┆ 1    │
    │ []        ┆ null │
    │ [1, 2]    ┆ 2    │
    └───────────┴──────┘
    """
    return self.get(index=-1, null_on_oob=True)
|
|
|
|
@unstable()
def item(self, *, allow_empty: bool = False) -> Expr:
    """
    Get the single value of the sublists.

    This errors if the sublist length is not exactly one.

    Parameters
    ----------
    allow_empty
        Allow having no values to return `null`.

    See Also
    --------
    :meth:`Expr.list.get` : Get the value by index in the sublists.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3], [1], [2]]})
    >>> df.with_columns(item=pl.col("a").list.item())
    shape: (3, 2)
    ┌───────────┬──────┐
    │ a         ┆ item │
    │ ---       ┆ ---  │
    │ list[i64] ┆ i64  │
    ╞═══════════╪══════╡
    │ [3]       ┆ 3    │
    │ [1]       ┆ 1    │
    │ [2]       ┆ 2    │
    └───────────┴──────┘
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [1], [2]]})
    >>> df.select(pl.col("a").list.item())
    Traceback (most recent call last):
    ...
    polars.exceptions.ComputeError: aggregation 'item' expected a single value, got 3 values
    >>> df = pl.DataFrame({"a": [[], [1], [2]]})
    >>> df.select(pl.col("a").list.item(allow_empty=True))
    shape: (3, 1)
    ┌──────┐
    │ a    │
    │ ---  │
    │ i64  │
    ╞══════╡
    │ null │
    │ 1    │
    │ 2    │
    └──────┘
    """  # noqa: W505
    # Expressed as an aggregation over the list elements.
    single_value = F.element().item(allow_empty=allow_empty)
    return self.agg(single_value)
|
|
|
|
def contains(self, item: IntoExpr, *, nulls_equal: bool = True) -> Expr:
    """
    Check if sublists contain the given item.

    Parameters
    ----------
    item
        Item that will be checked for membership
    nulls_equal : bool, default True
        If True, treat null as a distinct value. Null values will not propagate.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[3, 2, 1], [], [1, 2]]})
    >>> df.with_columns(contains=pl.col("a").list.contains(1))
    shape: (3, 2)
    ┌───────────┬──────────┐
    │ a         ┆ contains │
    │ ---       ┆ ---      │
    │ list[i64] ┆ bool     │
    ╞═══════════╪══════════╡
    │ [3, 2, 1] ┆ true     │
    │ []        ┆ false    │
    │ [1, 2]    ┆ true     │
    └───────────┴──────────┘
    """
    # Strings are parsed as literal values, not column names.
    needle = parse_into_expression(item, str_as_lit=True)
    membership = self._pyexpr.list_contains(needle, nulls_equal)
    return wrap_expr(membership)
|
|
|
|
def join(self, separator: IntoExprColumn, *, ignore_nulls: bool = True) -> Expr:
    """
    Join all string items in a sublist and place a separator between them.

    This errors if inner type of list `!= String`.

    Parameters
    ----------
    separator
        string to separate the items with
    ignore_nulls
        Ignore null values (default).

        If set to ``False``, null values will be propagated.
        If the sub-list contains any null values, the output is ``None``.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Examples
    --------
    >>> df = pl.DataFrame({"s": [["a", "b", "c"], ["x", "y"]]})
    >>> df.with_columns(join=pl.col("s").list.join(" "))
    shape: (2, 2)
    ┌─────────────────┬───────┐
    │ s               ┆ join  │
    │ ---             ┆ ---   │
    │ list[str]       ┆ str   │
    ╞═════════════════╪═══════╡
    │ ["a", "b", "c"] ┆ a b c │
    │ ["x", "y"]      ┆ x y   │
    └─────────────────┴───────┘

    >>> df = pl.DataFrame(
    ...     {"s": [["a", "b", "c"], ["x", "y"]], "separator": ["*", "_"]}
    ... )
    >>> df.with_columns(join=pl.col("s").list.join(pl.col("separator")))
    shape: (2, 3)
    ┌─────────────────┬───────────┬───────┐
    │ s               ┆ separator ┆ join  │
    │ ---             ┆ ---       ┆ ---   │
    │ list[str]       ┆ str       ┆ str   │
    ╞═════════════════╪═══════════╪═══════╡
    │ ["a", "b", "c"] ┆ *         ┆ a*b*c │
    │ ["x", "y"]      ┆ _         ┆ x_y   │
    └─────────────────┴───────────┴───────┘
    """
    # A plain string separator is parsed as a literal, not a column name.
    glue = parse_into_expression(separator, str_as_lit=True)
    joined = self._pyexpr.list_join(glue, ignore_nulls)
    return wrap_expr(joined)
|
|
|
|
def arg_min(self) -> Expr:
    """
    Retrieve the index of the minimal value in every sublist.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32` or :class:`UInt64`
        (depending on compilation).

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2], [2, 1]],
    ...     }
    ... )
    >>> df.with_columns(arg_min=pl.col("a").list.arg_min())
    shape: (2, 2)
    ┌───────────┬─────────┐
    │ a         ┆ arg_min │
    │ ---       ┆ ---     │
    │ list[i64] ┆ u32     │
    ╞═══════════╪═════════╡
    │ [1, 2]    ┆ 0       │
    │ [2, 1]    ┆ 1       │
    └───────────┴─────────┘
    """
    min_positions = self._pyexpr.list_arg_min()
    return wrap_expr(min_positions)
|
|
|
|
def arg_max(self) -> Expr:
    """
    Retrieve the index of the maximum value in every sublist.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32` or :class:`UInt64`
        (depending on compilation).

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2], [2, 1]],
    ...     }
    ... )
    >>> df.with_columns(arg_max=pl.col("a").list.arg_max())
    shape: (2, 2)
    ┌───────────┬─────────┐
    │ a         ┆ arg_max │
    │ ---       ┆ ---     │
    │ list[i64] ┆ u32     │
    ╞═══════════╪═════════╡
    │ [1, 2]    ┆ 1       │
    │ [2, 1]    ┆ 0       │
    └───────────┴─────────┘
    """
    max_positions = self._pyexpr.list_arg_max()
    return wrap_expr(max_positions)
|
|
|
|
def diff(self, n: int = 1, null_behavior: NullBehavior = "ignore") -> Expr:
    """
    Calculate the first discrete difference between shifted items of every sublist.

    Parameters
    ----------
    n
        Number of slots to shift.
    null_behavior : {'ignore', 'drop'}
        How to handle null values.

    Examples
    --------
    >>> df = pl.DataFrame({"n": [[1, 2, 3, 4], [10, 2, 1]]})
    >>> df.with_columns(diff=pl.col("n").list.diff())
    shape: (2, 2)
    ┌─────────────┬────────────────┐
    │ n           ┆ diff           │
    │ ---         ┆ ---            │
    │ list[i64]   ┆ list[i64]      │
    ╞═════════════╪════════════════╡
    │ [1, 2, … 4] ┆ [null, 1, … 1] │
    │ [10, 2, 1]  ┆ [null, -8, -1] │
    └─────────────┴────────────────┘

    >>> df.with_columns(diff=pl.col("n").list.diff(n=2))
    shape: (2, 2)
    ┌─────────────┬───────────────────┐
    │ n           ┆ diff              │
    │ ---         ┆ ---               │
    │ list[i64]   ┆ list[i64]         │
    ╞═════════════╪═══════════════════╡
    │ [1, 2, … 4] ┆ [null, null, … 2] │
    │ [10, 2, 1]  ┆ [null, null, -9]  │
    └─────────────┴───────────────────┘

    >>> df.with_columns(diff=pl.col("n").list.diff(n=2, null_behavior="drop"))
    shape: (2, 2)
    ┌─────────────┬───────────┐
    │ n           ┆ diff      │
    │ ---         ┆ ---       │
    │ list[i64]   ┆ list[i64] │
    ╞═════════════╪═══════════╡
    │ [1, 2, … 4] ┆ [2, 2]    │
    │ [10, 2, 1]  ┆ [-9]      │
    └─────────────┴───────────┘
    """
    differences = self._pyexpr.list_diff(n, null_behavior)
    return wrap_expr(differences)
|
|
|
|
def shift(self, n: int | IntoExprColumn = 1) -> Expr:
    """
    Shift list values by the given number of indices.

    Parameters
    ----------
    n
        Number of indices to shift forward. If a negative value is passed, values
        are shifted in the opposite direction instead.

    Notes
    -----
    This method is similar to the `LAG` operation in SQL when the value for `n`
    is positive. With a negative value for `n`, it is similar to `LEAD`.

    Examples
    --------
    By default, list values are shifted forward by one index.

    >>> df = pl.DataFrame({"a": [[1, 2, 3], [4, 5]]})
    >>> df.with_columns(shift=pl.col("a").list.shift())
    shape: (2, 2)
    ┌───────────┬──────────────┐
    │ a         ┆ shift        │
    │ ---       ┆ ---          │
    │ list[i64] ┆ list[i64]    │
    ╞═══════════╪══════════════╡
    │ [1, 2, 3] ┆ [null, 1, 2] │
    │ [4, 5]    ┆ [null, 4]    │
    └───────────┴──────────────┘

    Pass a negative value to shift in the opposite direction instead.

    >>> df.with_columns(shift=pl.col("a").list.shift(-2))
    shape: (2, 2)
    ┌───────────┬─────────────────┐
    │ a         ┆ shift           │
    │ ---       ┆ ---             │
    │ list[i64] ┆ list[i64]       │
    ╞═══════════╪═════════════════╡
    │ [1, 2, 3] ┆ [3, null, null] │
    │ [4, 5]    ┆ [null, null]    │
    └───────────┴─────────────────┘
    """
    periods = parse_into_expression(n)
    shifted = self._pyexpr.list_shift(periods)
    return wrap_expr(shifted)
|
|
|
|
def slice(
    self, offset: int | str | Expr, length: int | str | Expr | None = None
) -> Expr:
    """
    Slice every sublist.

    Parameters
    ----------
    offset
        Start index. Negative indexing is supported.
    length
        Length of the slice. If set to `None` (default), the slice is taken to the
        end of the list.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, 2, 3, 4], [10, 2, 1]]})
    >>> df.with_columns(slice=pl.col("a").list.slice(1, 2))
    shape: (2, 2)
    ┌─────────────┬───────────┐
    │ a           ┆ slice     │
    │ ---         ┆ ---       │
    │ list[i64]   ┆ list[i64] │
    ╞═════════════╪═══════════╡
    │ [1, 2, … 4] ┆ [2, 3]    │
    │ [10, 2, 1]  ┆ [2, 1]    │
    └─────────────┴───────────┘
    """
    # Parse `offset` before `length`, as the original does; a `None` length
    # is passed through the parser unchanged.
    start = parse_into_expression(offset)
    span = parse_into_expression(length)
    window = self._pyexpr.list_slice(start, span)
    return wrap_expr(window)
|
|
|
|
def head(self, n: int | str | Expr = 5) -> Expr:
    """
    Slice the first `n` values of every sublist.

    Delegates to `slice` with offset `0` and length `n`.

    Parameters
    ----------
    n
        Number of values to return for each sublist.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, 2, 3, 4], [10, 2, 1]]})
    >>> df.with_columns(head=pl.col("a").list.head(2))
    shape: (2, 2)
    ┌─────────────┬───────────┐
    │ a           ┆ head      │
    │ ---         ┆ ---       │
    │ list[i64]   ┆ list[i64] │
    ╞═════════════╪═══════════╡
    │ [1, 2, … 4] ┆ [1, 2]    │
    │ [10, 2, 1]  ┆ [10, 2]   │
    └─────────────┴───────────┘
    """
    return self.slice(offset=0, length=n)
|
|
|
|
def tail(self, n: int | str | Expr = 5) -> Expr:
    """
    Slice the last `n` values of every sublist.

    Parameters
    ----------
    n
        Number of values to return for each sublist.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, 2, 3, 4], [10, 2, 1]]})
    >>> df.with_columns(tail=pl.col("a").list.tail(2))
    shape: (2, 2)
    ┌─────────────┬───────────┐
    │ a           ┆ tail      │
    │ ---         ┆ ---       │
    │ list[i64]   ┆ list[i64] │
    ╞═════════════╪═══════════╡
    │ [1, 2, … 4] ┆ [3, 4]    │
    │ [10, 2, 1]  ┆ [2, 1]    │
    └─────────────┴───────────┘
    """
    count = parse_into_expression(n)
    trailing = self._pyexpr.list_tail(count)
    return wrap_expr(trailing)
|
|
|
|
def explode(self) -> Expr:
    """
    Return a column with a separate row for every list element.

    Returns
    -------
    Expr
        Expression with the data type of the list elements.

    See Also
    --------
    Expr.reshape: Reshape this Expr to a flat Series or a Series of Lists.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, 2, 3], [4, 5, 6]]})
    >>> df.select(pl.col("a").list.explode())
    shape: (6, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 3   │
    │ 4   │
    │ 5   │
    │ 6   │
    └─────┘
    """
    # Uses the generic `explode` of the underlying expression (no `list_` prefix).
    exploded = self._pyexpr.explode()
    return wrap_expr(exploded)
|
|
|
|
def count_matches(self, element: IntoExpr) -> Expr:
    """
    Count how often the value produced by `element` occurs.

    Parameters
    ----------
    element
        An expression that produces a single value

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[0], [1], [1, 2, 3, 2], [1, 2, 1], [4, 4]]})
    >>> df.with_columns(number_of_twos=pl.col("a").list.count_matches(2))
    shape: (5, 2)
    ┌─────────────┬────────────────┐
    │ a           ┆ number_of_twos │
    │ ---         ┆ ---            │
    │ list[i64]   ┆ u32            │
    ╞═════════════╪════════════════╡
    │ [0]         ┆ 0              │
    │ [1]         ┆ 0              │
    │ [1, 2, … 2] ┆ 2              │
    │ [1, 2, 1]   ┆ 1              │
    │ [4, 4]      ┆ 0              │
    └─────────────┴────────────────┘
    """
    # Strings are parsed as literal values, not column names.
    target = parse_into_expression(element, str_as_lit=True)
    occurrences = self._pyexpr.list_count_matches(target)
    return wrap_expr(occurrences)
|
|
|
|
def to_array(self, width: int) -> Expr:
    """
    Convert a List column into an Array column with the same inner data type.

    Parameters
    ----------
    width
        Width of the resulting Array column.

    Returns
    -------
    Expr
        Expression of data type :class:`Array`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={"a": [[1, 2], [3, 4]]},
    ...     schema={"a": pl.List(pl.Int8)},
    ... )
    >>> df.with_columns(array=pl.col("a").list.to_array(2))
    shape: (2, 2)
    ┌──────────┬──────────────┐
    │ a        ┆ array        │
    │ ---      ┆ ---          │
    │ list[i8] ┆ array[i8, 2] │
    ╞══════════╪══════════════╡
    │ [1, 2]   ┆ [1, 2]       │
    │ [3, 4]   ┆ [3, 4]       │
    └──────────┴──────────────┘
    """
    as_array = self._pyexpr.list_to_array(width)
    return wrap_expr(as_array)
|
|
|
|
def to_struct(
    self,
    n_field_strategy: ListToStructWidthStrategy | None = None,
    fields: Sequence[str] | Callable[[int], str] | None = None,
    upper_bound: int | None = None,
) -> Expr:
    """
    Convert the Series of type `List` to a Series of type `Struct`.

    Parameters
    ----------
    n_field_strategy : {'first_non_null', 'max_width'}
        Deprecated and ignored.
    fields
        If the name and number of the desired fields is known in advance
        a list of field names can be given, which will be assigned by index.
        Otherwise, to dynamically assign field names, a custom function can be
        used; if neither are set, fields will be `field_0, field_1 .. field_n`.
    upper_bound
        A polars expression needs to be able to evaluate the output datatype at all
        times, so the caller must provide an upper bound of the number of struct
        fields that will be created if `fields` is not a sequence of field names.

        .. versionchanged:: 1.33.0
            The `n_field_strategy` parameter is ignored and deprecated. The `fields`
            needs to be a sequence of field names or the upper bound is regarded as
            ground truth.

    Examples
    --------
    Convert list to struct with default field name assignment:

    >>> df = pl.DataFrame({"n": [[0, 1], [0, 1, 2]]})
    >>> df.with_columns(
    ...     struct=pl.col("n").list.to_struct(upper_bound=2)
    ... )  # doctest: +SKIP
    shape: (2, 2)
    ┌───────────┬───────────┐
    │ n         ┆ struct    │
    │ ---       ┆ ---       │
    │ list[i64] ┆ struct[2] │  # <- struct with 2 fields
    ╞═══════════╪═══════════╡
    │ [0, 1]    ┆ {0,1}     │  # OK
    │ [0, 1, 2] ┆ {0,1}     │  # NOT OK - last value missing
    └───────────┴───────────┘

    Convert list to struct with field name assignment by function/index:

    >>> df = pl.DataFrame({"n": [[0, 1], [2, 3]]})
    >>> df.select(
    ...     pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}", upper_bound=2)
    ... ).rows(named=True)  # doctest: +SKIP
    [{'n': {'n0': 0, 'n1': 1}}, {'n': {'n0': 2, 'n1': 3}}]

    Convert list to struct with field name assignment by index from a list of names:

    >>> df.select(pl.col("n").list.to_struct(fields=["one", "two"])).rows(
    ...     named=True
    ... )
    [{'n': {'one': 0, 'two': 1}}, {'n': {'one': 2, 'two': 3}}]
    """
    if n_field_strategy is not None:
        issue_warning(
            "`Expr.list.to_struct` with `n_field_strategy` is deprecated"
            " and has no effect on execution.",
            DeprecationWarning,
        )

    # Explicit field names are passed on unchanged; `upper_bound` is not
    # consulted in that case.
    if isinstance(fields, Sequence):
        return wrap_expr(self._pyexpr.list_to_struct(fields))

    if upper_bound is None:
        msg = (
            "`Expr.list.to_struct` requires either `fields` to be a sequence"
            " or `upper_bound` to be set.\n\n"
            "This used to be allowed but produced unpredictable results."
        )
        raise exceptions.InvalidOperationError(msg)

    # Generate exactly `upper_bound` names, from the default pattern or the
    # caller's naming function.
    if fields is None:
        field_names = [f"field_{i}" for i in range(upper_bound)]
    else:
        field_names = [fields(i) for i in range(upper_bound)]

    return wrap_expr(self._pyexpr.list_to_struct(field_names))
|
|
|
|
def eval(self, expr: Expr, *, parallel: bool = False) -> Expr:
    """
    Evaluate a polars expression on the elements of every list.

    Parameters
    ----------
    expr
        Expression to run. Note that you can select an element with `pl.element()`.
    parallel
        Run all expressions in parallel. Don't activate this blindly.
        Parallelism is worth it if there is enough work to do per thread.

        This likely should not be used in the group by context, because we already
        do parallel execution per group.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_columns(
    ...     rank=pl.concat_list("a", "b").list.eval(pl.element().rank())
    ... )
    shape: (3, 3)
    ┌─────┬─────┬────────────┐
    │ a   ┆ b   ┆ rank       │
    │ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ list[f64]  │
    ╞═════╪═════╪════════════╡
    │ 1   ┆ 4   ┆ [1.0, 2.0] │
    │ 8   ┆ 5   ┆ [2.0, 1.0] │
    │ 3   ┆ 2   ┆ [2.0, 1.0] │
    └─────┴─────┴────────────┘

    See Also
    --------
    polars.Expr.list.agg: Evaluate any expression and automatically explode.
    polars.Expr.arr.eval: Same for the Array datatype.
    """
    # Hand the underlying expression objects to the native implementation,
    # then wrap the result back into a public `Expr`.
    evaluated = self._pyexpr.list_eval(expr._pyexpr, parallel)
    return wrap_expr(evaluated)
|
|
|
|
def agg(self, expr: Expr) -> Expr:
    """
    Evaluate a polars aggregation expression on the elements of every list.

    Parameters
    ----------
    expr
        Expression to run. Note that you can select an element with `pl.element()`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [[1, None], [42, 13], [None, None]]})
    >>> df.with_columns(null_count=pl.col.a.list.agg(pl.element().null_count()))
    shape: (3, 2)
    ┌──────────────┬────────────┐
    │ a            ┆ null_count │
    │ ---          ┆ ---        │
    │ list[i64]    ┆ u32        │
    ╞══════════════╪════════════╡
    │ [1, null]    ┆ 1          │
    │ [42, 13]     ┆ 0          │
    │ [null, null] ┆ 2          │
    └──────────────┴────────────┘
    >>> df.with_columns(no_nulls=pl.col.a.list.agg(pl.element().drop_nulls()))
    shape: (3, 2)
    ┌──────────────┬───────────┐
    │ a            ┆ no_nulls  │
    │ ---          ┆ ---       │
    │ list[i64]    ┆ list[i64] │
    ╞══════════════╪═══════════╡
    │ [1, null]    ┆ [1]       │
    │ [42, 13]     ┆ [42, 13]  │
    │ [null, null] ┆ []        │
    └──────────────┴───────────┘

    See Also
    --------
    polars.Expr.list.eval: Evaluates expressions without automatically exploding.
    polars.Expr.arr.agg: Same for the Array datatype.
    """
    # Delegate to the native implementation and re-wrap the result.
    aggregated = self._pyexpr.list_agg(expr._pyexpr)
    return wrap_expr(aggregated)
|
|
|
|
def filter(self, predicate: Expr) -> Expr:
    """
    Keep only the elements of each list that satisfy a boolean expression.

    Parameters
    ----------
    predicate
        A boolean expression that is evaluated per list element.
        You can refer to the current element with `pl.element()`.

    Examples
    --------
    >>> import polars as pl
    >>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
    >>> df.with_columns(
    ...     evens=pl.concat_list("a", "b").list.filter(pl.element() % 2 == 0)
    ... )
    shape: (3, 3)
    ┌─────┬─────┬───────────┐
    │ a   ┆ b   ┆ evens     │
    │ --- ┆ --- ┆ ---       │
    │ i64 ┆ i64 ┆ list[i64] │
    ╞═════╪═════╪═══════════╡
    │ 1   ┆ 4   ┆ [4]       │
    │ 8   ┆ 5   ┆ [8]       │
    │ 3   ┆ 2   ┆ [2]       │
    └─────┴─────┴───────────┘
    """
    # Delegate to the native implementation and re-wrap the result.
    filtered = self._pyexpr.list_filter(predicate._pyexpr)
    return wrap_expr(filtered)
|
|
|
|
def set_union(self, other: IntoExpr | Collection[Any]) -> Expr:
    """
    Compute the SET UNION between the elements in this list and the elements of `other`.

    Parameters
    ----------
    other
        Right hand side of the set operation. A non-string collection is
        passed as a literal; any other input (such as the column name used
        in the example below) is parsed into an expression.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2, 3], [], [None, 3], [5, 6, 7]],
    ...         "b": [[2, 3, 4], [3], [3, 4, None], [6, 8]],
    ...     }
    ... )
    >>> df.with_columns(
    ...     union=pl.col("a").list.set_union("b")
    ... )  # doctest: +IGNORE_RESULT
    shape: (4, 3)
    ┌───────────┬──────────────┬───────────────┐
    │ a         ┆ b            ┆ union         │
    │ ---       ┆ ---          ┆ ---           │
    │ list[i64] ┆ list[i64]    ┆ list[i64]     │
    ╞═══════════╪══════════════╪═══════════════╡
    │ [1, 2, 3] ┆ [2, 3, 4]    ┆ [1, 2, 3, 4]  │
    │ []        ┆ [3]          ┆ [3]           │
    │ [null, 3] ┆ [3, 4, null] ┆ [null, 3, 4]  │
    │ [5, 6, 7] ┆ [6, 8]       ┆ [5, 6, 7, 8]  │
    └───────────┴──────────────┴───────────────┘
    """  # noqa: W505
    if isinstance(other, str) or not isinstance(other, Collection):
        # Strings and non-collection inputs go through expression parsing.
        rhs_pyexpr = parse_into_expression(other)
    elif isinstance(other, (Sequence, pl.Series, pl.DataFrame)):
        # Sequences, Series and DataFrames are passed to `lit` unchanged.
        rhs_pyexpr = F.lit(other)._pyexpr
    else:
        # Other collections (eg: set, frozenset, etc) are materialized first.
        rhs_pyexpr = F.lit(list(other))._pyexpr
    return wrap_expr(self._pyexpr.list_set_operation(rhs_pyexpr, "union"))
|
|
|
|
def set_difference(self, other: IntoExpr | Collection[Any]) -> Expr:
    """
    Compute the SET DIFFERENCE between the elements in this list and the elements of `other`.

    Parameters
    ----------
    other
        Right hand side of the set operation. A non-string collection is
        passed as a literal; any other input (such as the column name used
        in the example below) is parsed into an expression.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2, 3], [], [None, 3], [5, 6, 7]],
    ...         "b": [[2, 3, 4], [3], [3, 4, None], [6, 8]],
    ...     }
    ... )
    >>> df.with_columns(difference=pl.col("a").list.set_difference("b"))
    shape: (4, 3)
    ┌───────────┬──────────────┬────────────┐
    │ a         ┆ b            ┆ difference │
    │ ---       ┆ ---          ┆ ---        │
    │ list[i64] ┆ list[i64]    ┆ list[i64]  │
    ╞═══════════╪══════════════╪════════════╡
    │ [1, 2, 3] ┆ [2, 3, 4]    ┆ [1]        │
    │ []        ┆ [3]          ┆ []         │
    │ [null, 3] ┆ [3, 4, null] ┆ []         │
    │ [5, 6, 7] ┆ [6, 8]       ┆ [5, 7]     │
    └───────────┴──────────────┴────────────┘

    See Also
    --------
    polars.Expr.list.diff: Calculates the n-th discrete difference of every sublist.
    """  # noqa: W505
    if isinstance(other, str) or not isinstance(other, Collection):
        # Strings and non-collection inputs go through expression parsing.
        rhs_pyexpr = parse_into_expression(other)
    elif isinstance(other, (Sequence, pl.Series, pl.DataFrame)):
        # Sequences, Series and DataFrames are passed to `lit` unchanged.
        rhs_pyexpr = F.lit(other)._pyexpr
    else:
        # Other collections (eg: set, frozenset, etc) are materialized first.
        rhs_pyexpr = F.lit(list(other))._pyexpr
    return wrap_expr(self._pyexpr.list_set_operation(rhs_pyexpr, "difference"))
|
|
|
|
def set_intersection(self, other: IntoExpr | Collection[Any]) -> Expr:
    """
    Compute the SET INTERSECTION between the elements in this list and the elements of `other`.

    Parameters
    ----------
    other
        Right hand side of the set operation. A non-string collection is
        passed as a literal; any other input (such as the column name used
        in the example below) is parsed into an expression.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2, 3], [], [None, 3], [5, 6, 7]],
    ...         "b": [[2, 3, 4], [3], [3, 4, None], [6, 8]],
    ...     }
    ... )
    >>> df.with_columns(intersection=pl.col("a").list.set_intersection("b"))
    shape: (4, 3)
    ┌───────────┬──────────────┬──────────────┐
    │ a         ┆ b            ┆ intersection │
    │ ---       ┆ ---          ┆ ---          │
    │ list[i64] ┆ list[i64]    ┆ list[i64]    │
    ╞═══════════╪══════════════╪══════════════╡
    │ [1, 2, 3] ┆ [2, 3, 4]    ┆ [2, 3]       │
    │ []        ┆ [3]          ┆ []           │
    │ [null, 3] ┆ [3, 4, null] ┆ [null, 3]    │
    │ [5, 6, 7] ┆ [6, 8]       ┆ [6]          │
    └───────────┴──────────────┴──────────────┘
    """  # noqa: W505
    if isinstance(other, str) or not isinstance(other, Collection):
        # Strings and non-collection inputs go through expression parsing.
        rhs_pyexpr = parse_into_expression(other)
    elif isinstance(other, (Sequence, pl.Series, pl.DataFrame)):
        # Sequences, Series and DataFrames are passed to `lit` unchanged.
        rhs_pyexpr = F.lit(other)._pyexpr
    else:
        # Other collections (eg: set, frozenset, etc) are materialized first.
        rhs_pyexpr = F.lit(list(other))._pyexpr
    return wrap_expr(self._pyexpr.list_set_operation(rhs_pyexpr, "intersection"))
|
|
|
|
def set_symmetric_difference(self, other: IntoExpr | Collection[Any]) -> Expr:
    """
    Compute the SET SYMMETRIC DIFFERENCE between the elements in this list and the elements of `other`.

    Parameters
    ----------
    other
        Right hand side of the set operation. A non-string collection is
        passed as a literal; any other input (such as the column name used
        in the example below) is parsed into an expression.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [[1, 2, 3], [], [None, 3], [5, 6, 7]],
    ...         "b": [[2, 3, 4], [3], [3, 4, None], [6, 8]],
    ...     }
    ... )
    >>> df.with_columns(sdiff=pl.col("b").list.set_symmetric_difference("a"))
    shape: (4, 3)
    ┌───────────┬──────────────┬───────────┐
    │ a         ┆ b            ┆ sdiff     │
    │ ---       ┆ ---          ┆ ---       │
    │ list[i64] ┆ list[i64]    ┆ list[i64] │
    ╞═══════════╪══════════════╪═══════════╡
    │ [1, 2, 3] ┆ [2, 3, 4]    ┆ [4, 1]    │
    │ []        ┆ [3]          ┆ [3]       │
    │ [null, 3] ┆ [3, 4, null] ┆ [4]       │
    │ [5, 6, 7] ┆ [6, 8]       ┆ [8, 5, 7] │
    └───────────┴──────────────┴───────────┘
    """  # noqa: W505
    if isinstance(other, str) or not isinstance(other, Collection):
        # Strings and non-collection inputs go through expression parsing.
        rhs_pyexpr = parse_into_expression(other)
    elif isinstance(other, (Sequence, pl.Series, pl.DataFrame)):
        # Sequences, Series and DataFrames are passed to `lit` unchanged.
        rhs_pyexpr = F.lit(other)._pyexpr
    else:
        # Other collections (eg: set, frozenset, etc) are materialized first.
        rhs_pyexpr = F.lit(list(other))._pyexpr
    return wrap_expr(
        self._pyexpr.list_set_operation(rhs_pyexpr, "symmetric_difference")
    )
|