141 lines
4.5 KiB
Python
141 lines
4.5 KiB
Python
from __future__ import annotations
|
|
|
|
from collections.abc import Mapping, Sequence
|
|
from typing import Any
|
|
|
|
|
|
def _parquet_field_overwrites_dict_to_dict_list(
    pqo: dict[str, ParquetFieldOverwrites],
) -> list[dict[str, Any]]:
    """
    Convert a name-keyed mapping of overwrites into a list of plain dicts.

    Each key of `pqo` becomes the `"name"` entry of the corresponding output
    dict. The overwrite objects themselves are not modified, so the same mapping
    can be converted more than once.

    Parameters
    ----------
    pqo
        Mapping from field name to the overwrites for that field. The overwrites
        must not have a name of their own.

    Returns
    -------
    list of dict
        One dict per entry of `pqo`, in iteration order.

    Raises
    ------
    ValueError
        If an overwrite in `pqo` already has its `name` set.
    """
    children = []
    for name, child in pqo.items():
        if child.name is not None:
            msg = "ParquetFieldOverwrites has both a name in the dictionary and in the overwrites"
            raise ValueError(msg)
        # Attach the key to the converted dict instead of assigning `child.name`:
        # mutating the caller's object would make a second conversion of the same
        # mapping trip the check above.
        children.append({"name": name, **_parquet_field_overwrites_to_dict(child)})
    return children
|
|
|
|
|
|
def _parquet_field_overwrites_to_dict(pqo: ParquetFieldOverwrites) -> dict[str, Any]:
    """
    Convert a `ParquetFieldOverwrites` into a plain dict.

    Only attributes that are not `None` produce an entry. The possible keys, in
    insertion order, are `"name"`, `"children"`, `"field_id"`, `"metadata"` and
    `"required"`.

    Parameters
    ----------
    pqo
        The overwrites to convert.

    Returns
    -------
    dict
        The converted overwrites. `"children"` is a dict for a single child
        overwrite and a list of dicts for a dict or list of children.
        `"metadata"` is a list of `(key, value)` tuples.

    Raises
    ------
    TypeError
        If `pqo.children` is neither `None`, a `ParquetFieldOverwrites`, a
        `dict` nor a `list`.
    """
    out: dict[str, Any] = {}

    if pqo.name is not None:
        out["name"] = pqo.name

    nested = pqo.children
    if nested is not None:
        if isinstance(nested, ParquetFieldOverwrites):
            # A single child overwrite is converted recursively.
            converted = _parquet_field_overwrites_to_dict(nested)
        elif isinstance(nested, dict):
            # Children keyed by name: the keys supply the child names.
            converted = _parquet_field_overwrites_dict_to_dict_list(nested)
        elif isinstance(nested, list):
            converted = [_parquet_field_overwrites_to_dict(item) for item in nested]
        else:
            msg = "invalid ParquetFieldOverwrites children type"
            raise TypeError(msg)
        out["children"] = converted

    if pqo.field_id is not None:
        out["field_id"] = pqo.field_id

    meta = pqo.metadata
    if meta is not None:
        # Emitted as a list of pairs rather than as a mapping.
        out["metadata"] = [*meta.items()]

    if pqo.required is not None:
        out["required"] = pqo.required

    return out
|
|
|
|
|
|
class ParquetFieldOverwrites:
    """
    Write-option overwrites for individual Parquet fields.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Parameters
    ----------
    name
        Name of the column or field.
    children
        Overwrites for the children of the column or field. A mapping is stored
        as a `dict` and any other non-string sequence as a `list`; every other
        value (including `None` and a single `ParquetFieldOverwrites`) is stored
        unchanged.
    field_id
        The field ID used in the Parquet schema.
    metadata
        Arrow metadata added to the field before writing. A mapping is stored as
        a `dict`.
    required
        Whether the field is not allowed to have missing values.

    Examples
    --------
    >>> lf = pl.LazyFrame(
    ...     {
    ...         "a": [None, 2, 3, 4],
    ...         "b": [[1, 2, 3], [42], [13], [37]],
    ...         "c": [
    ...             {"x": "a", "y": 42},
    ...             {"x": "b", "y": 13},
    ...             {"x": "X", "y": 37},
    ...             {"x": "Y", "y": 15},
    ...         ],
    ...     }
    ... )  # doctest: +SKIP
    >>> lf.sink_parquet(
    ...     "./out/parquet",
    ...     field_overwrites={
    ...         "a": ParquetFieldOverwrites(metadata={"flat_from_polars": "yes"}),
    ...         "b": ParquetFieldOverwrites(
    ...             children=ParquetFieldOverwrites(metadata={"listitem": "yes"}),
    ...             metadata={"list": "true"},
    ...         ),
    ...         "c": ParquetFieldOverwrites(
    ...             children=[
    ...                 ParquetFieldOverwrites(name="x", metadata={"md": "yes"}),
    ...                 ParquetFieldOverwrites(name="y", metadata={"md2": "Yes!"}),
    ...             ],
    ...             metadata={"struct": "true"},
    ...         ),
    ...     },
    ... )  # doctest: +SKIP
    """

    name: None | str  #: Name of the column or field
    children: (
        None
        | ParquetFieldOverwrites
        | list[ParquetFieldOverwrites]
        | dict[str, ParquetFieldOverwrites]
    )  #: Children of the column or field.
    #
    # For flat types (e.g. `Int32`), this should be `None`. For lists, this can be an
    # unnamed `ParquetFieldOverwrites`. For structs, this can be a dict or list of named
    # overwrites.

    field_id: int | None = None  #: The field ID used in the Parquet schema
    metadata: (
        dict[str, None | str] | None
    )  #: Arrow metadata added to the field before writing
    required: bool | None = None  #: Is the field not allowed to have missing values

    def __init__(
        self,
        *,
        name: str | None = None,
        children: (
            None
            | ParquetFieldOverwrites
            | Sequence[ParquetFieldOverwrites]
            | Mapping[str, ParquetFieldOverwrites]
        ) = None,
        field_id: int | None = None,
        metadata: Mapping[str, None | str] | None = None,
        required: bool | None = None,
    ) -> None:
        self.name = name

        # Normalize containers to the concrete `dict` / `list` types declared on
        # the attribute; this also copies them, so later changes to the caller's
        # container are not reflected here.
        if isinstance(children, Mapping):
            self.children = dict(children)
        elif isinstance(children, Sequence) and not isinstance(
            children, (str, bytes, bytearray)
        ):
            self.children = list(children)
        else:
            # `None`, a single `ParquetFieldOverwrites`, or an unsupported value.
            # Strings and bytes are sequences too, but exploding them into a list
            # of characters would only hide the mistake, so they are kept as-is.
            self.children = children

        self.field_id = field_id
        if isinstance(metadata, Mapping):
            self.metadata = dict(metadata)
        else:
            self.metadata = metadata
        self.required = required
|