DriverTrac/venv/lib/python3.12/site-packages/polars/expr/binary.py

347 lines
14 KiB
Python

from __future__ import annotations
from typing import TYPE_CHECKING
from polars._utils.parse import parse_into_expression
from polars._utils.various import scale_bytes
from polars._utils.wrap import wrap_expr
from polars.datatypes import parse_into_datatype_expr
if TYPE_CHECKING:
from polars import DataTypeExpr, Expr
from polars._typing import (
Endianness,
IntoExpr,
PolarsDataType,
SizeUnit,
TransferEncoding,
)
class ExprBinaryNameSpace:
    """Namespace for bin related expressions."""

    _accessor = "bin"

    def __init__(self, expr: Expr) -> None:
        # Keep only the underlying native expression handle; every method
        # below delegates to it and re-wraps the result via `wrap_expr`.
        self._pyexpr = expr._pyexpr

    def contains(self, literal: IntoExpr) -> Expr:
        r"""
        Check if binaries in Series contain a binary substring.

        Parameters
        ----------
        literal
            The binary substring to look for

        Returns
        -------
        Expr
            Expression of data type :class:`Boolean`.

        See Also
        --------
        starts_with : Check if the binary substring exists at the start
        ends_with : Check if the binary substring exists at the end

        Examples
        --------
        >>> colors = pl.DataFrame(
        ...     {
        ...         "name": ["black", "yellow", "blue"],
        ...         "code": [b"\x00\x00\x00", b"\xff\xff\x00", b"\x00\x00\xff"],
        ...         "lit": [b"\x00", b"\xff\x00", b"\xff\xff"],
        ...     }
        ... )
        >>> colors.select(
        ...     "name",
        ...     pl.col("code").bin.contains(b"\xff").alias("contains_with_lit"),
        ...     pl.col("code").bin.contains(pl.col("lit")).alias("contains_with_expr"),
        ... )
        shape: (3, 3)
        ┌────────┬───────────────────┬────────────────────┐
        │ name   ┆ contains_with_lit ┆ contains_with_expr │
        │ ---    ┆ ---               ┆ ---                │
        │ str    ┆ bool              ┆ bool               │
        ╞════════╪═══════════════════╪════════════════════╡
        │ black  ┆ false             ┆ true               │
        │ yellow ┆ true              ┆ true               │
        │ blue   ┆ true              ┆ false              │
        └────────┴───────────────────┴────────────────────┘
        """
        literal_pyexpr = parse_into_expression(literal, str_as_lit=True)
        return wrap_expr(self._pyexpr.bin_contains(literal_pyexpr))

    def ends_with(self, suffix: IntoExpr) -> Expr:
        r"""
        Check if binary values end with a binary substring.

        Parameters
        ----------
        suffix
            Suffix substring.

        Returns
        -------
        Expr
            Expression of data type :class:`Boolean`.

        See Also
        --------
        starts_with : Check if the binary substring exists at the start
        contains : Check if the binary substring exists anywhere

        Examples
        --------
        >>> colors = pl.DataFrame(
        ...     {
        ...         "name": ["black", "yellow", "blue"],
        ...         "code": [b"\x00\x00\x00", b"\xff\xff\x00", b"\x00\x00\xff"],
        ...         "suffix": [b"\x00", b"\xff\x00", b"\x00\x00"],
        ...     }
        ... )
        >>> colors.select(
        ...     "name",
        ...     pl.col("code").bin.ends_with(b"\xff").alias("ends_with_lit"),
        ...     pl.col("code").bin.ends_with(pl.col("suffix")).alias("ends_with_expr"),
        ... )
        shape: (3, 3)
        ┌────────┬───────────────┬────────────────┐
        │ name   ┆ ends_with_lit ┆ ends_with_expr │
        │ ---    ┆ ---           ┆ ---            │
        │ str    ┆ bool          ┆ bool           │
        ╞════════╪═══════════════╪════════════════╡
        │ black  ┆ false         ┆ true           │
        │ yellow ┆ false         ┆ true           │
        │ blue   ┆ true          ┆ false          │
        └────────┴───────────────┴────────────────┘
        """
        suffix_pyexpr = parse_into_expression(suffix, str_as_lit=True)
        return wrap_expr(self._pyexpr.bin_ends_with(suffix_pyexpr))

    def starts_with(self, prefix: IntoExpr) -> Expr:
        r"""
        Check if values start with a binary substring.

        Parameters
        ----------
        prefix
            Prefix substring.

        Returns
        -------
        Expr
            Expression of data type :class:`Boolean`.

        See Also
        --------
        ends_with : Check if the binary substring exists at the end
        contains : Check if the binary substring exists anywhere

        Examples
        --------
        >>> colors = pl.DataFrame(
        ...     {
        ...         "name": ["black", "yellow", "blue"],
        ...         "code": [b"\x00\x00\x00", b"\xff\xff\x00", b"\x00\x00\xff"],
        ...         "prefix": [b"\x00", b"\xff\x00", b"\x00\x00"],
        ...     }
        ... )
        >>> colors.select(
        ...     "name",
        ...     pl.col("code").bin.starts_with(b"\xff").alias("starts_with_lit"),
        ...     pl.col("code")
        ...     .bin.starts_with(pl.col("prefix"))
        ...     .alias("starts_with_expr"),
        ... )
        shape: (3, 3)
        ┌────────┬─────────────────┬──────────────────┐
        │ name   ┆ starts_with_lit ┆ starts_with_expr │
        │ ---    ┆ ---             ┆ ---              │
        │ str    ┆ bool            ┆ bool             │
        ╞════════╪═════════════════╪══════════════════╡
        │ black  ┆ false           ┆ true             │
        │ yellow ┆ true            ┆ false            │
        │ blue   ┆ false           ┆ true             │
        └────────┴─────────────────┴──────────────────┘
        """
        prefix_pyexpr = parse_into_expression(prefix, str_as_lit=True)
        return wrap_expr(self._pyexpr.bin_starts_with(prefix_pyexpr))

    def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr:
        r"""
        Decode values using the provided encoding.

        Parameters
        ----------
        encoding : {'hex', 'base64'}
            The encoding to use.
        strict
            Raise an error if the underlying value cannot be decoded,
            otherwise mask out with a null value.

        Returns
        -------
        Expr
            Expression of data type :class:`Binary`.

        Raises
        ------
        ValueError
            If `encoding` is neither ``'hex'`` nor ``'base64'``.

        Examples
        --------
        >>> colors = pl.DataFrame(
        ...     {
        ...         "name": ["black", "yellow", "blue"],
        ...         "encoded": [b"000000", b"ffff00", b"0000ff"],
        ...     }
        ... )
        >>> colors.with_columns(
        ...     pl.col("encoded").bin.decode("hex").alias("code"),
        ... )
        shape: (3, 3)
        ┌────────┬───────────┬─────────────────┐
        │ name   ┆ encoded   ┆ code            │
        │ ---    ┆ ---       ┆ ---             │
        │ str    ┆ binary    ┆ binary          │
        ╞════════╪═══════════╪═════════════════╡
        │ black  ┆ b"000000" ┆ b"\x00\x00\x00" │
        │ yellow ┆ b"ffff00" ┆ b"\xff\xff\x00" │
        │ blue   ┆ b"0000ff" ┆ b"\x00\x00\xff" │
        └────────┴───────────┴─────────────────┘
        """
        if encoding == "hex":
            return wrap_expr(self._pyexpr.bin_hex_decode(strict))
        elif encoding == "base64":
            return wrap_expr(self._pyexpr.bin_base64_decode(strict))
        else:
            msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
            raise ValueError(msg)

    def encode(self, encoding: TransferEncoding) -> Expr:
        r"""
        Encode a value using the provided encoding.

        Parameters
        ----------
        encoding : {'hex', 'base64'}
            The encoding to use.

        Returns
        -------
        Expr
            Expression of data type :class:`String`.

        Raises
        ------
        ValueError
            If `encoding` is neither ``'hex'`` nor ``'base64'``.

        Examples
        --------
        >>> colors = pl.DataFrame(
        ...     {
        ...         "color": ["black", "yellow", "blue"],
        ...         "code": [b"\x00\x00\x00", b"\xff\xff\x00", b"\x00\x00\xff"],
        ...     }
        ... )
        >>> colors.with_columns(
        ...     pl.col("code").bin.encode("hex").alias("encoded"),
        ... )
        shape: (3, 3)
        ┌────────┬─────────────────┬─────────┐
        │ color  ┆ code            ┆ encoded │
        │ ---    ┆ ---             ┆ ---     │
        │ str    ┆ binary          ┆ str     │
        ╞════════╪═════════════════╪═════════╡
        │ black  ┆ b"\x00\x00\x00" ┆ 000000  │
        │ yellow ┆ b"\xff\xff\x00" ┆ ffff00  │
        │ blue   ┆ b"\x00\x00\xff" ┆ 0000ff  │
        └────────┴─────────────────┴─────────┘
        """
        if encoding == "hex":
            return wrap_expr(self._pyexpr.bin_hex_encode())
        elif encoding == "base64":
            return wrap_expr(self._pyexpr.bin_base64_encode())
        else:
            msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
            raise ValueError(msg)

    def size(self, unit: SizeUnit = "b") -> Expr:
        r"""
        Get the size of binary values in the given unit.

        Parameters
        ----------
        unit : {'b', 'kb', 'mb', 'gb', 'tb'}
            Scale the returned size to the given unit.

        Returns
        -------
        Expr
            Expression of data type :class:`UInt32` or `Float64`.

        Examples
        --------
        >>> from os import urandom
        >>> df = pl.DataFrame({"data": [urandom(n) for n in (512, 256, 1024)]})
        >>> df.with_columns(  # doctest: +IGNORE_RESULT
        ...     n_bytes=pl.col("data").bin.size(),
        ...     n_kilobytes=pl.col("data").bin.size("kb"),
        ... )
        shape: (3, 3)
        ┌─────────────────────────────────┬─────────┬─────────────┐
        │ data                            ┆ n_bytes ┆ n_kilobytes │
        │ ---                             ┆ ---     ┆ ---         │
        │ binary                          ┆ u32     ┆ f64         │
        ╞═════════════════════════════════╪═════════╪═════════════╡
        │ b"y?~B\x83\xf4V\x07\xd3\xfb\xb… ┆ 512     ┆ 0.5         │
        │ b"\xee$4@f\xc14\x07\x8e\x88\x1… ┆ 256     ┆ 0.25        │
        │ b"\x80\xbd\xb9nEq;2\x99$\xf9\x… ┆ 1024    ┆ 1.0         │
        └─────────────────────────────────┴─────────┴─────────────┘
        """
        # Size is first obtained in bytes, then rescaled to the requested unit.
        size_in_bytes = wrap_expr(self._pyexpr.bin_size_bytes())
        return scale_bytes(size_in_bytes, unit)

    def reinterpret(
        self, *, dtype: PolarsDataType | DataTypeExpr, endianness: Endianness = "little"
    ) -> Expr:
        r"""
        Interpret bytes as another type.

        Supported types are numerical or temporal dtypes, or an ``Array`` of
        these dtypes.

        Parameters
        ----------
        dtype : PolarsDataType | DataTypeExpr
            Which type to interpret binary column into.
        endianness : {"big", "little"}, optional
            Which endianness to use when interpreting bytes, by default "little".

        Returns
        -------
        Expr
            Expression of data type `dtype`.
            Note that rows of the binary array where the length does not match
            the size in bytes of the output array (number of items * byte size
            of item) will become NULL.

        Examples
        --------
        >>> df = pl.DataFrame({"data": [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]})
        >>> df.with_columns(  # doctest: +IGNORE_RESULT
        ...     bin2int=pl.col("data").bin.reinterpret(
        ...         dtype=pl.Int32, endianness="little"
        ...     ),
        ... )
        shape: (2, 2)
        ┌─────────────────────┬─────────┐
        │ data                ┆ bin2int │
        │ ---                 ┆ ---     │
        │ binary              ┆ i32     │
        ╞═════════════════════╪═════════╡
        │ b"\x05\x00\x00\x00" ┆ 5       │
        │ b"\x10\x00\x01\x00" ┆ 65552   │
        └─────────────────────┴─────────┘
        """
        # Normalise the requested dtype into a datatype expression without
        # shadowing the caller-supplied `dtype` argument.
        dtype_expr = parse_into_datatype_expr(dtype)
        return wrap_expr(
            self._pyexpr.bin_reinterpret(dtype_expr._pydatatype_expr, endianness)
        )