307 lines
10 KiB
Python
307 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from polars._utils.various import qualified_type_name
|
|
from polars._utils.wrap import wrap_expr
|
|
|
|
if TYPE_CHECKING:
|
|
from polars import Expr
|
|
|
|
|
|
class ExprCatNameSpace:
    """Expression methods that operate on categorical columns (`Expr.cat`)."""

    _accessor = "cat"

    def __init__(self, expr: Expr) -> None:
        # Only the underlying native expression handle is retained.
        self._pyexpr = expr._pyexpr

    def get_categories(self) -> Expr:
        """
        Return the categories held by this categorical data type.

        Examples
        --------
        >>> df = pl.Series(
        ...     "cats", ["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical
        ... ).to_frame()
        >>> df.select(pl.col("cats").cat.get_categories())  # doctest: +SKIP
        shape: (3, 1)
        ┌──────┐
        │ cats │
        │ ---  │
        │ str  │
        ╞══════╡
        │ foo  │
        │ bar  │
        │ ham  │
        └──────┘
        """
        categories = self._pyexpr.cat_get_categories()
        return wrap_expr(categories)

    def len_bytes(self) -> Expr:
        """
        Compute the length in bytes of each value's string representation.

        Returns
        -------
        Expr
            Expression of data type :class:`UInt32`.

        See Also
        --------
        len_chars

        Notes
        -----
        For non-ASCII text the number of bytes differs from the number of
        characters; :func:`len_chars` may be the better fit in that case.
        Keep in mind that :func:`len_bytes` (_O(1)_) is considerably cheaper
        than :func:`len_chars` (_O(n)_).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)}
        ... )
        >>> df.with_columns(
        ...     pl.col("a").cat.len_bytes().alias("n_bytes"),
        ...     pl.col("a").cat.len_chars().alias("n_chars"),
        ... )
        shape: (4, 3)
        ┌──────┬─────────┬─────────┐
        │ a    ┆ n_bytes ┆ n_chars │
        │ ---  ┆ ---     ┆ ---     │
        │ cat  ┆ u32     ┆ u32     │
        ╞══════╪═════════╪═════════╡
        │ Café ┆ 5       ┆ 4       │
        │ 345  ┆ 3       ┆ 3       │
        │ 東京 ┆ 6       ┆ 2       │
        │ null ┆ null    ┆ null    │
        └──────┴─────────┴─────────┘
        """
        byte_lengths = self._pyexpr.cat_len_bytes()
        return wrap_expr(byte_lengths)

    def len_chars(self) -> Expr:
        """
        Compute the length in characters of each value's string representation.

        Returns
        -------
        Expr
            Expression of data type :class:`UInt32`.

        See Also
        --------
        len_bytes

        Notes
        -----
        For ASCII-only text, :func:`len_bytes` yields the same result at a much
        lower cost: :func:`len_bytes` runs in _O(1)_, whereas :func:`len_chars`
        runs in _O(n)_.

        A character is defined as a `Unicode scalar value`_. ASCII characters
        occupy a single byte each; any other character occupies at most 4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)}
        ... )
        >>> df.with_columns(
        ...     pl.col("a").cat.len_chars().alias("n_chars"),
        ...     pl.col("a").cat.len_bytes().alias("n_bytes"),
        ... )
        shape: (4, 3)
        ┌──────┬─────────┬─────────┐
        │ a    ┆ n_chars ┆ n_bytes │
        │ ---  ┆ ---     ┆ ---     │
        │ cat  ┆ u32     ┆ u32     │
        ╞══════╪═════════╪═════════╡
        │ Café ┆ 4       ┆ 5       │
        │ 345  ┆ 3       ┆ 3       │
        │ 東京 ┆ 2       ┆ 6       │
        │ null ┆ null    ┆ null    │
        └──────┴─────────┴─────────┘
        """
        char_lengths = self._pyexpr.cat_len_chars()
        return wrap_expr(char_lengths)

    def starts_with(self, prefix: str) -> Expr:
        """
        Test whether each value's string representation begins with `prefix`.

        Parameters
        ----------
        prefix
            Prefix substring.

        Raises
        ------
        TypeError
            If `prefix` is not a `str`.

        See Also
        --------
        contains : Check if string reprs contain a substring that matches a pattern.
        ends_with : Check if string reprs end with a substring.

        Notes
        -----
        Unlike `str.starts_with`, which also accepts expression inputs,
        `cat.starts_with` only accepts a literal string value.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"fruits": pl.Series(["apple", "mango", None], dtype=pl.Categorical)}
        ... )
        >>> df.with_columns(
        ...     pl.col("fruits").cat.starts_with("app").alias("has_prefix"),
        ... )
        shape: (3, 2)
        ┌────────┬────────────┐
        │ fruits ┆ has_prefix │
        │ ---    ┆ ---        │
        │ cat    ┆ bool       │
        ╞════════╪════════════╡
        │ apple  ┆ true       │
        │ mango  ┆ false      │
        │ null   ┆ null       │
        └────────┴────────────┘

        Using `starts_with` as a filter condition:

        >>> df.filter(pl.col("fruits").cat.starts_with("app"))
        shape: (1, 1)
        ┌────────┐
        │ fruits │
        │ ---    │
        │ cat    │
        ╞════════╡
        │ apple  │
        └────────┘
        """
        # Happy path first: a literal string is forwarded to the native call.
        if isinstance(prefix, str):
            return wrap_expr(self._pyexpr.cat_starts_with(prefix))

        msg = f"'prefix' must be a string; found {qualified_type_name(prefix)!r}"
        raise TypeError(msg)

    def ends_with(self, suffix: str) -> Expr:
        """
        Test whether each value's string representation finishes with `suffix`.

        Parameters
        ----------
        suffix
            Suffix substring.

        Raises
        ------
        TypeError
            If `suffix` is not a `str`.

        See Also
        --------
        contains : Check if string reprs contain a substring that matches a pattern.
        starts_with : Check if string reprs start with a substring.

        Notes
        -----
        Unlike `str.ends_with`, which also accepts expression inputs,
        `cat.ends_with` only accepts a literal string value.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"fruits": pl.Series(["apple", "mango", None], dtype=pl.Categorical)}
        ... )
        >>> df.with_columns(pl.col("fruits").cat.ends_with("go").alias("has_suffix"))
        shape: (3, 2)
        ┌────────┬────────────┐
        │ fruits ┆ has_suffix │
        │ ---    ┆ ---        │
        │ cat    ┆ bool       │
        ╞════════╪════════════╡
        │ apple  ┆ false      │
        │ mango  ┆ true       │
        │ null   ┆ null       │
        └────────┴────────────┘

        Using `ends_with` as a filter condition:

        >>> df.filter(pl.col("fruits").cat.ends_with("go"))
        shape: (1, 1)
        ┌────────┐
        │ fruits │
        │ ---    │
        │ cat    │
        ╞════════╡
        │ mango  │
        └────────┘
        """
        # Happy path first: a literal string is forwarded to the native call.
        if isinstance(suffix, str):
            return wrap_expr(self._pyexpr.cat_ends_with(suffix))

        msg = f"'suffix' must be a string; found {qualified_type_name(suffix)!r}"
        raise TypeError(msg)

    def slice(self, offset: int, length: int | None = None) -> Expr:
        """
        Take a substring of each value's string representation.

        Parameters
        ----------
        offset
            Start index. Negative indexing is supported.
        length
            Length of the slice. If set to `None` (default), the slice is taken to the
            end of the string.

        Returns
        -------
        Expr
            Expression of data type :class:`String`.

        Notes
        -----
        `offset` and `length` are both counted in characters of the (UTF8)
        string, not in bytes. A character is defined as a
        `Unicode scalar value`_. ASCII characters occupy a single byte each;
        any other character occupies at most 4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "s": pl.Series(
        ...             ["pear", None, "papaya", "dragonfruit"],
        ...             dtype=pl.Categorical,
        ...         )
        ...     }
        ... )
        >>> df.with_columns(pl.col("s").cat.slice(-3).alias("slice"))
        shape: (4, 2)
        ┌─────────────┬───────┐
        │ s           ┆ slice │
        │ ---         ┆ ---   │
        │ cat         ┆ str   │
        ╞═════════════╪═══════╡
        │ pear        ┆ ear   │
        │ null        ┆ null  │
        │ papaya      ┆ aya   │
        │ dragonfruit ┆ uit   │
        └─────────────┴───────┘

        Using the optional `length` parameter

        >>> df.with_columns(pl.col("s").cat.slice(4, length=3).alias("slice"))
        shape: (4, 2)
        ┌─────────────┬───────┐
        │ s           ┆ slice │
        │ ---         ┆ ---   │
        │ cat         ┆ str   │
        ╞═════════════╪═══════╡
        │ pear        ┆       │
        │ null        ┆ null  │
        │ papaya      ┆ ya    │
        │ dragonfruit ┆ onf   │
        └─────────────┴───────┘
        """
        sliced = self._pyexpr.cat_slice(offset, length)
        return wrap_expr(sliced)
|