247 lines
6.4 KiB
Python
247 lines
6.4 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from polars._utils.unstable import unstable
|
|
from polars._utils.wrap import wrap_s
|
|
from polars.series.utils import expr_dispatch
|
|
|
|
if TYPE_CHECKING:
|
|
from polars import Series
|
|
from polars._plr import PySeries
|
|
|
|
|
|
@expr_dispatch
class CatNameSpace:
    """Series namespace grouping the operations available on categorical data."""

    _accessor = "cat"

    # NOTE(review): several methods below consist of a docstring only; their
    # behaviour is presumably supplied by the `expr_dispatch` class decorator.
    # Confirm how that decorator detects them before adding statements there.

    def __init__(self, series: Series) -> None:
        # Keep a handle on the underlying PySeries; the non-empty methods below
        # delegate to it directly.
        py_series: PySeries = series._s
        self._s = py_series

    def get_categories(self) -> Series:
        """
        Return the categories held by the data type of this Series.

        Examples
        --------
        >>> s = pl.Series(["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical)
        >>> s.cat.get_categories()  # doctest: +SKIP
        shape: (3,)
        Series: '' [str]
        [
            "foo"
            "bar"
            "ham"
        ]
        """

    def is_local(self) -> bool:
        """
        Report whether the column is a local categorical.

        The answer is always false.
        """
        local_flag = self._s.cat_is_local()
        return local_flag

    def to_local(self) -> Series:
        """Return the column unchanged; local representations are deprecated."""
        py_result = self._s.cat_to_local()
        return wrap_s(py_result)

    @unstable()
    def uses_lexical_ordering(self) -> bool:
        """
        Report whether this Series uses lexical ordering.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

        Examples
        --------
        >>> s = pl.Series(["b", "a", "b"]).cast(pl.Categorical)
        >>> s.cat.uses_lexical_ordering()
        True
        """
        lexical_flag = self._s.cat_uses_lexical_ordering()
        return lexical_flag

    def len_bytes(self) -> Series:
        """
        Compute the length in bytes of the string representation of each value.

        Returns
        -------
        Series
            Series of data type :class:`UInt32`.

        See Also
        --------
        len_chars

        Notes
        -----
        For non-ASCII text the number of bytes differs from the number of
        characters, so :func:`len_chars` may be the method you want. Keep in
        mind that :func:`len_bytes` (_O(1)_) is much more performant than
        :func:`len_chars` (_O(n)_).

        Examples
        --------
        >>> s = pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)
        >>> s.cat.len_bytes()
        shape: (4,)
        Series: '' [u32]
        [
            5
            3
            6
            null
        ]
        """

    def len_chars(self) -> Series:
        """
        Compute the length in characters of the string representation of each value.

        Returns
        -------
        Series
            Series of data type :class:`UInt32`.

        See Also
        --------
        len_bytes

        Notes
        -----
        For ASCII text, :func:`len_bytes` yields equivalent output with much
        better performance: :func:`len_bytes` runs in _O(1)_, whereas
        :func:`len_chars` runs in _O(n)_.

        A character is defined as a `Unicode scalar value`_. With ASCII text a
        single character takes a single byte; otherwise it takes at most
        4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> s = pl.Series(["Café", "345", "東京", None], dtype=pl.Categorical)
        >>> s.cat.len_chars()
        shape: (4,)
        Series: '' [u32]
        [
            4
            3
            2
            null
        ]
        """

    def starts_with(self, prefix: str) -> Series:
        """
        Test whether the string representation of each value begins with a substring.

        Parameters
        ----------
        prefix
            Prefix substring.

        Returns
        -------
        Series
            Series of data type :class:`Boolean`.

        See Also
        --------
        contains : Check if the string repr contains a substring that matches a pattern.
        ends_with : Check if string repr ends with a substring.

        Examples
        --------
        >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
        >>> s.cat.starts_with("app")
        shape: (3,)
        Series: 'fruits' [bool]
        [
            true
            false
            null
        ]
        """

    def ends_with(self, suffix: str) -> Series:
        """
        Test whether the string representation of each value finishes with a substring.

        Parameters
        ----------
        suffix
            Suffix substring.

        Returns
        -------
        Series
            Series of data type :class:`Boolean`.

        See Also
        --------
        contains : Check if the string repr contains a substring that matches a pattern.
        starts_with : Check if string repr starts with a substring.

        Examples
        --------
        >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
        >>> s.cat.ends_with("go")
        shape: (3,)
        Series: 'fruits' [bool]
        [
            false
            true
            null
        ]
        """

    def slice(self, offset: int, length: int | None = None) -> Series:
        """
        Take a substring out of the string representation of each value.

        Parameters
        ----------
        offset
            Start index. Negative indexing is supported.
        length
            Length of the slice. When left as `None` (the default), the slice
            runs to the end of the string.

        Returns
        -------
        Series
            Series of data type :class:`String`.

        Notes
        -----
        `offset` and `length` are both counted in characters of the (UTF8)
        string, not in bytes. A character is defined as a
        `Unicode scalar value`_. With ASCII text a single character takes a
        single byte; otherwise it takes at most 4 bytes.

        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value

        Examples
        --------
        >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"], dtype=pl.Categorical)
        >>> s.cat.slice(-3)
        shape: (4,)
        Series: '' [str]
        [
            "ear"
            null
            "aya"
            "uit"
        ]

        Using the optional `length` parameter

        >>> s.cat.slice(4, length=3)
        shape: (4,)
        Series: '' [str]
        [
            ""
            null
            "ya"
            "onf"
        ]
        """
|