DriverTrac/venv/lib/python3.12/site-packages/polars/io/parquet/field_overwrites.py

141 lines
4.5 KiB
Python

from __future__ import annotations
from collections.abc import Mapping, Sequence
from typing import Any
def _parquet_field_overwrites_dict_to_dict_list(
pqo: dict[str, ParquetFieldOverwrites],
) -> list[dict[str, Any]]:
children = []
for name, child in pqo.items():
if child.name is not None:
msg = "ParquetFieldOverwrites has both a name in the dictionary and in the overwrites"
raise ValueError(msg)
child.name = name
children.append(_parquet_field_overwrites_to_dict(child))
return children
def _parquet_field_overwrites_to_dict(pqo: ParquetFieldOverwrites) -> dict[str, Any]:
d: dict[str, Any] = {}
# Name
if pqo.name is not None:
d["name"] = pqo.name
# Children
if pqo.children is not None:
if isinstance(pqo.children, ParquetFieldOverwrites):
d["children"] = _parquet_field_overwrites_to_dict(pqo.children)
elif isinstance(pqo.children, dict):
d["children"] = _parquet_field_overwrites_dict_to_dict_list(pqo.children)
elif isinstance(pqo.children, list):
d["children"] = [_parquet_field_overwrites_to_dict(c) for c in pqo.children]
else:
msg = "invalid ParquetFieldOverwrites children type"
raise TypeError(msg)
if pqo.field_id is not None:
d["field_id"] = pqo.field_id
# Metadata
if pqo.metadata is not None:
d["metadata"] = list(pqo.metadata.items())
if pqo.required is not None:
d["required"] = pqo.required
return d
class ParquetFieldOverwrites:
"""
Write-option overwrites for individual Parquet fields.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
Examples
--------
>>> lf = pl.LazyFrame(
... {
... "a": [None, 2, 3, 4],
... "b": [[1, 2, 3], [42], [13], [37]],
... "c": [
... {"x": "a", "y": 42},
... {"x": "b", "y": 13},
... {"x": "X", "y": 37},
... {"x": "Y", "y": 15},
... ],
... }
... ) # doctest: +SKIP
>>> lf.sink_parquet(
... "./out/parquet",
... field_overwrites={
... "a": ParquetFieldOverwrites(metadata={"flat_from_polars": "yes"}),
... "b": ParquetFieldOverwrites(
... children=ParquetFieldOverwrites(metadata={"listitem": "yes"}),
... metadata={"list": "true"},
... ),
... "c": ParquetFieldOverwrites(
... children=[
... ParquetFieldOverwrites(name="x", metadata={"md": "yes"}),
... ParquetFieldOverwrites(name="y", metadata={"md2": "Yes!"}),
... ],
... metadata={"struct": "true"},
... ),
... },
... ) # doctest: +SKIP
"""
name: None | str #: Name of the column or field
children: (
None
| ParquetFieldOverwrites
| list[ParquetFieldOverwrites]
| dict[str, ParquetFieldOverwrites]
) #: Children of the column or field.
#
# For flat types (e.g. `Int32`), this should be `None`. For lists, this can be a
# unnamed `ParquetFieldOverwrites`. For structs, this can be a dict or list of named
# overwrites.
field_id: int | None = None #: The field ID used in the Parquet schema
metadata: (
dict[str, None | str] | None
) #: Arrow metadata added to the field before writing
required: bool | None = None #: Is the field not allowed to have missing values
def __init__(
self,
*,
name: str | None = None,
children: (
None
| ParquetFieldOverwrites
| Sequence[ParquetFieldOverwrites]
| Mapping[str, ParquetFieldOverwrites]
) = None,
field_id: int | None = None,
metadata: Mapping[str, None | str] | None = None,
required: bool | None = None,
) -> None:
self.name = name
if isinstance(children, Mapping):
self.children = dict(children)
elif isinstance(children, Sequence):
self.children = list(children)
else:
self.children = children
self.field_id = field_id
if isinstance(metadata, Mapping):
self.metadata = dict(metadata)
else:
self.metadata = metadata
self.required = required