80 lines
2.6 KiB
Python
80 lines
2.6 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from polars._utils.unstable import unstable
|
|
from polars.io.pyarrow_dataset.anonymous_scan import _scan_pyarrow_dataset
|
|
|
|
if TYPE_CHECKING:
|
|
from polars import LazyFrame
|
|
from polars._dependencies import pyarrow as pa
|
|
|
|
|
|
@unstable()
def scan_pyarrow_dataset(
    source: pa.dataset.Dataset,
    *,
    allow_pyarrow_filter: bool = True,
    batch_size: int | None = None,
) -> LazyFrame:
    """
    Scan a pyarrow dataset into a LazyFrame.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Scanning through pyarrow can be useful for connecting to cloud or
    partitioned datasets.

    Parameters
    ----------
    source
        The pyarrow dataset to scan.
    allow_pyarrow_filter
        Whether predicates may be pushed down to pyarrow. Pushing predicates
        down can lead to different results when comparisons involve null
        values, because pyarrow handles these differently than Polars does.
    batch_size
        The maximum row count for scanned pyarrow record batches.

    Warnings
    --------
    Do not use this if you accept untrusted user inputs. Predicates will be
    evaluated with Python's `eval`. There is sanitization in place, but it
    remains a possible attack vector.
    This method can only push down predicates that are allowed by PyArrow
    (that is, not the full Polars API).

    If :func:`scan_parquet` works for your source, you should use that instead.

    Notes
    -----
    When using partitioning, the appropriate `partitioning` option must be set
    on `pyarrow.dataset.dataset` before the dataset is passed to Polars;
    otherwise the partitioned-on column(s) may not get passed to Polars.

    Examples
    --------
    >>> import pyarrow.dataset as ds
    >>> dset = ds.dataset("s3://my-partitioned-folder/", format="ipc")  # doctest: +SKIP
    >>> (
    ...     pl.scan_pyarrow_dataset(dset)
    ...     .filter("bools")
    ...     .select("bools", "floats", "date")
    ...     .collect()
    ... )  # doctest: +SKIP
    shape: (1, 3)
    ┌───────┬────────┬────────────┐
    │ bools ┆ floats ┆ date       │
    │ ---   ┆ ---    ┆ ---        │
    │ bool  ┆ f64    ┆ date       │
    ╞═══════╪════════╪════════════╡
    │ true  ┆ 2.0    ┆ 1970-05-04 │
    └───────┴────────┴────────────┘
    """
    # This public entry point only forwards its arguments; the scan itself is
    # built by the private helper imported from `anonymous_scan`.
    scanned = _scan_pyarrow_dataset(
        source,
        allow_pyarrow_filter=allow_pyarrow_filter,
        batch_size=batch_size,
    )
    return scanned
|