188 lines
5.5 KiB
Python
188 lines
5.5 KiB
Python
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
from collections.abc import Iterator
|
|
from typing import TYPE_CHECKING, Callable
|
|
|
|
import polars._reexport as pl
|
|
from polars._utils.unstable import unstable
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Iterator
|
|
from typing import Callable
|
|
|
|
from polars import DataFrame, Expr, LazyFrame
|
|
from polars._typing import SchemaDict
|
|
|
|
|
|
@unstable()
|
|
def register_io_source(
|
|
io_source: Callable[
|
|
[list[str] | None, Expr | None, int | None, int | None], Iterator[DataFrame]
|
|
],
|
|
*,
|
|
schema: Callable[[], SchemaDict] | SchemaDict,
|
|
validate_schema: bool = False,
|
|
is_pure: bool = False,
|
|
) -> LazyFrame:
|
|
"""
|
|
Register your IO plugin and initialize a LazyFrame.
|
|
|
|
See the `user guide <https://docs.pola.rs/user-guide/plugins/io_plugins>`_
|
|
for more information about plugins.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
|
|
Parameters
|
|
----------
|
|
io_source
|
|
Function that accepts the following arguments:
|
|
with_columns
|
|
Columns that are projected. The reader must
|
|
project these columns if applied
|
|
predicate
|
|
Polars expression. The reader must filter
|
|
their rows accordingly.
|
|
n_rows
|
|
Materialize only n rows from the source.
|
|
The reader can stop when `n_rows` are read.
|
|
batch_size
|
|
A hint of the ideal batch size the reader's
|
|
generator must produce.
|
|
|
|
The function should return a an iterator/generator
|
|
that produces DataFrames.
|
|
schema
|
|
Schema or function that when called produces the schema that the reader
|
|
will produce before projection pushdown.
|
|
validate_schema
|
|
Whether the engine should validate if the batches generated match
|
|
the given schema. It's an implementation error if this isn't
|
|
the case and can lead to bugs that are hard to solve.
|
|
is_pure
|
|
Whether the IO source is pure. Repeated occurrences of same IO source in
|
|
a LazyFrame plan can be de-duplicated during optimization if they are
|
|
pure.
|
|
|
|
Returns
|
|
-------
|
|
LazyFrame
|
|
"""
|
|
|
|
def wrap(
|
|
with_columns: list[str] | None,
|
|
predicate: bytes | None,
|
|
n_rows: int | None,
|
|
batch_size: int | None,
|
|
) -> tuple[Iterator[DataFrame], bool]:
|
|
parsed_predicate_success = True
|
|
parsed_predicate = None
|
|
if predicate:
|
|
try:
|
|
parsed_predicate = pl.Expr.deserialize(predicate)
|
|
except Exception as e:
|
|
if os.environ.get("POLARS_VERBOSE"):
|
|
print(
|
|
f"failed parsing IO plugin expression\n\nfilter will be handled on Polars' side: {e}",
|
|
file=sys.stderr,
|
|
)
|
|
parsed_predicate_success = False
|
|
|
|
return io_source(
|
|
with_columns, parsed_predicate, n_rows, batch_size
|
|
), parsed_predicate_success
|
|
|
|
return pl.LazyFrame._scan_python_function(
|
|
schema=schema,
|
|
scan_fn=wrap,
|
|
pyarrow=False,
|
|
validate_schema=validate_schema,
|
|
is_pure=is_pure,
|
|
)
|
|
|
|
|
|
@unstable()
|
|
def _defer(
|
|
function: Callable[[], DataFrame],
|
|
*,
|
|
schema: SchemaDict | Callable[[], SchemaDict],
|
|
validate_schema: bool = True,
|
|
) -> LazyFrame:
|
|
"""
|
|
Deferred execution.
|
|
|
|
Takes a function that produces a `DataFrame` but defers execution until the
|
|
`LazyFrame` is collected.
|
|
|
|
Parameters
|
|
----------
|
|
function
|
|
Function that takes no arguments and produces a `DataFrame`.
|
|
schema
|
|
Schema of the `DataFrame` the deferred function will return.
|
|
The caller must ensure this schema is correct.
|
|
validate_schema
|
|
Whether the engine should validate if the batches generated match
|
|
the given schema. It's an implementation error if this isn't
|
|
the case and can lead to bugs that are hard to solve.
|
|
|
|
Examples
|
|
--------
|
|
Delay DataFrame execution until query is executed.
|
|
|
|
>>> import numpy as np
|
|
>>> np.random.seed(0)
|
|
>>> lf = pl.defer(
|
|
... lambda: pl.DataFrame({"a": np.random.randn(3)}), schema={"a": pl.Float64}
|
|
... )
|
|
>>> lf.collect()
|
|
shape: (3, 1)
|
|
┌──────────┐
|
|
│ a │
|
|
│ --- │
|
|
│ f64 │
|
|
╞══════════╡
|
|
│ 1.764052 │
|
|
│ 0.400157 │
|
|
│ 0.978738 │
|
|
└──────────┘
|
|
|
|
Run an eager source in Polars Cloud
|
|
|
|
>>> (
|
|
... pl.defer(
|
|
... lambda: pl.read_database("select * from tbl"),
|
|
... schema={"a": pl.Float64, "b": pl.Boolean},
|
|
... )
|
|
... .filter("b")
|
|
... .sum("a")
|
|
... .remote()
|
|
... .collect()
|
|
... ) # doctest: +SKIP
|
|
|
|
|
|
"""
|
|
|
|
def source(
|
|
with_columns: list[str] | None,
|
|
predicate: Expr | None,
|
|
n_rows: int | None,
|
|
batch_size: int | None,
|
|
) -> Iterator[DataFrame]:
|
|
lf = function().lazy()
|
|
if with_columns is not None:
|
|
lf = lf.select(with_columns)
|
|
if predicate is not None:
|
|
lf = lf.filter(predicate)
|
|
if n_rows is not None:
|
|
lf = lf.limit(n_rows)
|
|
yield lf.collect()
|
|
|
|
return register_io_source(
|
|
io_source=source, schema=schema, validate_schema=validate_schema
|
|
)
|