DriverTrac/venv/lib/python3.12/site-packages/polars/series/categorical.py

247 lines
6.4 KiB
Python

from __future__ import annotations
from typing import TYPE_CHECKING
from polars._utils.unstable import unstable
from polars._utils.wrap import wrap_s
from polars.series.utils import expr_dispatch
if TYPE_CHECKING:
from polars import Series
from polars._plr import PySeries
@expr_dispatch
class CatNameSpace:
    """Series namespace exposing categorical-specific operations under ``.cat``."""

    # Attribute name under which this namespace is reached (``series.cat``).
    _accessor = "cat"

    # NOTE(review): several methods below consist of a docstring only.
    # Presumably ``@expr_dispatch`` supplies their implementation -- confirm
    # before adding any statement (even ``pass``) to those bodies.

    def __init__(self, series: Series) -> None:
        # Hold on to the underlying PySeries handle; the methods with a real
        # body delegate to it directly.
        self._s: PySeries = series._s

    def get_categories(self) -> Series:
        """
        Return the categories held by this data type.

        Examples
        --------
        >>> s = pl.Series(["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical)
        >>> s.cat.get_categories()  # doctest: +SKIP
        shape: (3,)
        Series: '' [str]
        [
            "foo"
            "bar"
            "ham"
        ]
        """

    def is_local(self) -> bool:
        """
        Report whether the column is a local categorical.

        Always returns false.
        """
        local_flag = self._s.cat_is_local()
        return local_flag

    def to_local(self) -> Series:
        """Return the column unchanged; local representations are deprecated."""
        py_series = self._s.cat_to_local()
        return wrap_s(py_series)

    @unstable()
    def uses_lexical_ordering(self) -> bool:
        """
        Tell whether this Series is ordered lexically.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

        Examples
        --------
        >>> s = pl.Series(["b", "a", "b"]).cast(pl.Categorical)
        >>> s.cat.uses_lexical_ordering()
        True
        """
        lexical = self._s.cat_uses_lexical_ordering()
        return lexical

    def len_bytes(self) -> Series:
        """
        Compute, per value, the length in bytes of its string representation.

        Returns
        -------
        Series
            Series of data type :class:`UInt32`.

        See Also
        --------
        len_chars

        Notes
        -----
        For non-ASCII text the byte length differs from the character length;
        :func:`len_chars` may be the better fit in that case. Keep in mind that
        :func:`len_bytes` (``O(1)``) is considerably faster than
        :func:`len_chars` (``O(n)``).

        Examples
        --------
        >>> s = pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)
        >>> s.cat.len_bytes()
        shape: (4,)
        Series: '' [u32]
        [
            5
            3
            6
            null
        ]
        """

    def len_chars(self) -> Series:
        """
        Compute, per value, the character count of its string representation.

        Returns
        -------
        Series
            Series of data type :class:`UInt32`.

        See Also
        --------
        len_bytes

        Notes
        -----
        For ASCII text, :func:`len_bytes` yields the same result with much
        better performance: :func:`len_bytes` runs in ``O(1)`` whereas
        :func:`len_chars` runs in ``O(n)``.

        A character is defined as a `Unicode scalar value`_. With ASCII text a
        character occupies a single byte; otherwise it occupies at most 4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> s = pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)
        >>> s.cat.len_chars()
        shape: (4,)
        Series: '' [u32]
        [
            4
            3
            2
            null
        ]
        """

    def starts_with(self, prefix: str) -> Series:
        """
        Test whether the string representation of each value begins with a substring.

        Parameters
        ----------
        prefix
            Prefix substring.

        See Also
        --------
        contains : Check if the string repr contains a substring that matches a pattern.
        ends_with : Check if string repr ends with a substring.

        Examples
        --------
        >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
        >>> s.cat.starts_with("app")
        shape: (3,)
        Series: 'fruits' [bool]
        [
            true
            false
            null
        ]
        """

    def ends_with(self, suffix: str) -> Series:
        """
        Test whether the string representation of each value finishes with a substring.

        Parameters
        ----------
        suffix
            Suffix substring.

        See Also
        --------
        contains : Check if the string repr contains a substring that matches a pattern.
        starts_with : Check if string repr starts with a substring.

        Examples
        --------
        >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
        >>> s.cat.ends_with("go")
        shape: (3,)
        Series: 'fruits' [bool]
        [
            false
            true
            null
        ]
        """

    def slice(self, offset: int, length: int | None = None) -> Series:
        """
        Take a substring out of the string representation of each value.

        Parameters
        ----------
        offset
            Start index. Negative indexing is supported.
        length
            Length of the slice. When left as `None` (the default), the slice
            runs to the end of the string.

        Returns
        -------
        Series
            Series of data type :class:`String`.

        Notes
        -----
        `offset` and `length` are both counted in characters of the (UTF8)
        string. A character is defined as a `Unicode scalar value`_. With ASCII
        text a character occupies a single byte; otherwise it occupies at most
        4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"], dtype=pl.Categorical)
        >>> s.cat.slice(-3)
        shape: (4,)
        Series: '' [str]
        [
            "ear"
            null
            "aya"
            "uit"
        ]

        Using the optional `length` parameter

        >>> s.cat.slice(4, length=3)
        shape: (4,)
        Series: '' [str]
        [
            ""
            null
            "ya"
            "onf"
        ]
        """