DriverTrac/venv/lib/python3.12/site-packages/polars/sql/context.py

from __future__ import annotations

import contextlib
import re
from typing import (
    TYPE_CHECKING,
    Callable,
    Generic,
    Union,
    overload,
)

from polars._dependencies import _check_for_pandas, _check_for_pyarrow
from polars._dependencies import pandas as pd
from polars._dependencies import pyarrow as pa
from polars._typing import FrameType
from polars._utils.deprecation import deprecate_renamed_parameter
from polars._utils.pycapsule import is_pycapsule
from polars._utils.unstable import issue_unstable_warning
from polars._utils.various import _get_stack_locals, qualified_type_name
from polars._utils.wrap import wrap_ldf
from polars.convert import from_arrow, from_pandas
from polars.dataframe import DataFrame
from polars.lazyframe import LazyFrame
from polars.series import Series

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars._plr import PySQLContext

if TYPE_CHECKING:
    import sys
    from collections.abc import Collection, Mapping
    from types import TracebackType
    from typing import Any, Final, Literal

    if sys.version_info >= (3, 10):
        from typing import TypeAlias
    else:
        from typing_extensions import TypeAlias

    if sys.version_info >= (3, 11):
        from typing import Self
    else:
        from typing_extensions import Self

    CompatibleFrameType: TypeAlias = Union[
        DataFrame,
        LazyFrame,
        Series,
        pd.DataFrame,
        pd.Series[Any],
        pa.Table,
        pa.RecordBatch,
    ]

__all__ = ["SQLContext"]


def _compatible_frame(obj: Any) -> bool:
    """Check if the object can be converted to DataFrame."""
    return (
        is_pycapsule(obj)
        or isinstance(obj, LazyFrame)
        or (_check_for_pandas(obj) and isinstance(obj, (pd.DataFrame, pd.Series)))
        or (_check_for_pyarrow(obj) and isinstance(obj, (pa.Table, pa.RecordBatch)))
    )


def _ensure_lazyframe(obj: Any) -> LazyFrame:
    """Return LazyFrame from compatible input."""
    if isinstance(obj, (DataFrame, LazyFrame)):
        return obj.lazy()
    elif isinstance(obj, Series):
        return obj.to_frame().lazy()
    elif _check_for_pandas(obj) and isinstance(obj, (pd.DataFrame, pd.Series)):
        if isinstance(frame := from_pandas(obj), Series):
            frame = frame.to_frame()
        return frame.lazy()
    elif is_pycapsule(obj) or (
        _check_for_pyarrow(obj) and isinstance(obj, (pa.Table, pa.RecordBatch))
    ):
        return from_arrow(obj).lazy()  # type: ignore[union-attr]
    else:
        msg = f"unrecognised frame type: {qualified_type_name(obj)}"
        raise ValueError(msg)


def _get_frame_locals(
    *,
    all_compatible: bool,
    n_objects: int | None = None,
    named: str | Collection[str] | Callable[[str], bool] | None = None,
) -> dict[str, Any]:
    """Return compatible frame objects from the local stack."""
    of_type = _compatible_frame if all_compatible else (DataFrame, LazyFrame, Series)
    return _get_stack_locals(of_type=of_type, n_objects=n_objects, named=named)  # type: ignore[arg-type]


class SQLContext(Generic[FrameType]):
    """
    Run SQL queries against DataFrame, LazyFrame, and Series data.

    .. warning::
        This functionality is considered **unstable**, although it is close to being
        considered stable. It may be changed at any point without it being considered
        a breaking change.
    """

    _ctxt: PySQLContext
    _eager_execution: Final[bool]
    _tables_scope_stack: list[set[str]]

    # note: the type-overloaded methods are required to support accurate typing
    # of the frame return from "execute" (which may be DataFrame or LazyFrame),
    # as that is influenced by both the "eager" flag at init-time AND the "eager"
    # flag at query-time (if anyone can find a lighter-weight set of annotations
    # that successfully resolves this, please go for it... ;)

    @overload
    def __init__(
        self: SQLContext[LazyFrame],
        frames: Mapping[str, CompatibleFrameType | None] | None = ...,
        *,
        register_globals: bool | int = ...,
        all_compatible: bool = ...,
        eager: Literal[False] = False,
        **named_frames: CompatibleFrameType | None,
    ) -> None: ...

    @overload
    def __init__(
        self: SQLContext[DataFrame],
        frames: Mapping[str, CompatibleFrameType | None] | None = ...,
        *,
        register_globals: bool | int = ...,
        all_compatible: bool = ...,
        eager: Literal[True],
        **named_frames: CompatibleFrameType | None,
    ) -> None: ...

    @overload
    def __init__(
        self: SQLContext[DataFrame],
        frames: Mapping[str, CompatibleFrameType | None] | None = ...,
        *,
        register_globals: bool | int = ...,
        all_compatible: bool = ...,
        eager: bool,
        **named_frames: CompatibleFrameType | None,
    ) -> None: ...

    @deprecate_renamed_parameter("eager_execution", "eager", version="0.20.31")
    def __init__(
        self,
        frames: Mapping[str, CompatibleFrameType | None] | None = None,
        *,
        register_globals: bool | int = False,
        eager: bool = False,
        **named_frames: CompatibleFrameType | None,
    ) -> None:
        """
        Initialize a new `SQLContext`.

        .. versionchanged:: 0.20.31
            The `eager_execution` parameter was renamed `eager`.

        Parameters
        ----------
        frames
            A `{name:frame, ...}` mapping which can include Polars frames *and*
            pandas DataFrames, Series and pyarrow Table and RecordBatch objects.
        register_globals
            Register compatible objects (polars DataFrame, LazyFrame, and Series) found
            in the globals, automatically mapping their variable name to a table name.
            To register other objects (pandas/pyarrow data) pass them explicitly, or
            call the `execute_global` classmethod. If given an integer then only the
            most recent "n" objects found will be registered.
        eager
            If True, returns execution results as `DataFrame` instead of `LazyFrame`.
            (Note that the query itself is always executed in lazy-mode; this parameter
            impacts whether :meth:`execute` returns an eager or lazy result frame).
        **named_frames
            Named eager/lazy frames, provided as kwargs.

        Examples
        --------
        >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", None, "z"]})
        >>> res = pl.SQLContext(frame=lf).execute(
        ...     "SELECT b, a*2 AS two_a FROM frame WHERE b IS NOT NULL"
        ... )
        >>> res.collect()
        shape: (2, 2)
        ┌─────┬───────┐
        │ b   ┆ two_a │
        │ --- ┆ ---   │
        │ str ┆ i64   │
        ╞═════╪═══════╡
        │ x   ┆ 2     │
        │ z   ┆ 6     │
        └─────┴───────┘
        """
        issue_unstable_warning(
            "`SQLContext` is considered **unstable**, although it is close to being considered stable."
        )
        self._ctxt = PySQLContext.new()
        self._eager_execution = eager

        frames = dict(frames or {})
        if register_globals:
            for name, obj in _get_frame_locals(
                all_compatible=False,
            ).items():
                if name not in frames and name not in named_frames:
                    named_frames[name] = obj

        if frames or named_frames:
            frames.update(named_frames)
            self.register_many(frames)

    @overload
    @classmethod
    def execute_global(
        cls, query: str, *, eager: Literal[False] = False
    ) -> LazyFrame: ...

    @overload
    @classmethod
    def execute_global(cls, query: str, *, eager: Literal[True]) -> DataFrame: ...

    @overload
    @classmethod
    def execute_global(cls, query: str, *, eager: bool) -> DataFrame | LazyFrame: ...

    @classmethod
    def execute_global(
        cls, query: str, *, eager: bool = False
    ) -> DataFrame | LazyFrame:
        """
        Immediately execute a SQL query, automatically registering frame globals.

        Notes
        -----
        * This convenience method automatically registers all compatible objects in
          the local stack that are referenced in the query, mapping their variable name
          to a table name. Note that in addition to polars DataFrame, LazyFrame, and
          Series this method *also* registers pandas DataFrame, Series, and pyarrow
          Table and RecordBatch objects.
        * Instead of calling this classmethod you should consider using `pl.sql`,
          which will use this code internally.

        Parameters
        ----------
        query
            A valid SQL query string.
        eager
            If True, returns execution results as `DataFrame` instead of `LazyFrame`.
            (Note that the query itself is always executed in lazy-mode).

        Examples
        --------
        >>> import pandas as pd
        >>> df = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> df_pandas = pd.DataFrame({"a": [2, 3, 4], "c": [7, 8, 9]})

        Join a polars LazyFrame with a pandas DataFrame (note use of the preferred
        `pl.sql` method, which is equivalent to `SQLContext.execute_global`):

        >>> pl.sql("SELECT df.*, c FROM df JOIN df_pandas USING(a)").collect()
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 2   ┆ 5   ┆ 7   │
        │ 3   ┆ 6   ┆ 8   │
        └─────┴─────┴─────┘
        """
        # basic extraction of possible table names from the query, so we don't register
        # unnecessary objects from the globals (ideally we shuoold look to make the
        # underlying `sqlparser-rs` lib parse the query to identify table names)
        q = re.split(r"\bFROM\b", query, maxsplit=1, flags=re.I)
        possible_names = (
            {
                nm.strip('"')
                for nm in re.split(r"\b", q[1])
                if re.match(r'^("[^"]+")$', nm) or nm.isidentifier()
            }
            if len(q) > 1
            else set()
        )
        # get compatible frame objects from the globals, constraining by possible names
        named_frames = _get_frame_locals(all_compatible=True, named=possible_names)
        with cls(frames=named_frames, register_globals=False) as ctx:
            return ctx.execute(query=query, eager=eager)

    def __enter__(self) -> SQLContext[FrameType]:
        """Track currently registered tables on scope entry; supports nested scopes."""
        self._tables_scope_stack = getattr(self, "_tables_scope_stack", [])
        self._tables_scope_stack.append(set(self.tables()))
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """
        Unregister any tables created within the given scope on context exit.

        See Also
        --------
        unregister
        """
        self.unregister(
            names=(set(self.tables()) - self._tables_scope_stack.pop()),
        )

    def __repr__(self) -> str:
        n_tables = len(self.tables())
        return f"<SQLContext [tables:{n_tables}] at 0x{id(self):x}>"

    # these overloads are necessary to cover the possible permutations
    # of the init-time "eager" param, and the local "eager" param.

    @overload
    def execute(
        self: SQLContext[DataFrame], query: str, *, eager: None = ...
    ) -> DataFrame: ...

    @overload
    def execute(
        self: SQLContext[DataFrame], query: str, *, eager: Literal[False]
    ) -> LazyFrame: ...

    @overload
    def execute(
        self: SQLContext[DataFrame], query: str, *, eager: Literal[True]
    ) -> DataFrame: ...

    @overload
    def execute(
        self: SQLContext[LazyFrame], query: str, *, eager: None = ...
    ) -> LazyFrame: ...

    @overload
    def execute(
        self: SQLContext[LazyFrame], query: str, *, eager: Literal[False]
    ) -> LazyFrame: ...

    @overload
    def execute(
        self: SQLContext[LazyFrame], query: str, *, eager: Literal[True]
    ) -> DataFrame: ...

    @overload
    def execute(
        self, query: str, *, eager: bool | None = ...
    ) -> LazyFrame | DataFrame: ...

    def execute(
        self, query: str, *, eager: bool | None = None
    ) -> LazyFrame | DataFrame:
        """
        Parse the given SQL query and execute it against the registered frame data.

        Parameters
        ----------
        query
            A valid string SQL query.
        eager
            Apply the query eagerly, returning `DataFrame` instead of `LazyFrame`.
            If unset, the value of the init-time "eager" parameter will be used.
            Note that the query itself is always executed in lazy-mode; this
            parameter only impacts the type of the returned frame.

        Examples
        --------
        Declare frame data and register with a SQLContext:

        >>> df = pl.DataFrame(
        ...     data=[
        ...         ("The Godfather", 1972, 6_000_000, 134_821_952, 9.2),
        ...         ("The Dark Knight", 2008, 185_000_000, 533_316_061, 9.0),
        ...         ("Schindler's List", 1993, 22_000_000, 96_067_179, 8.9),
        ...         ("Pulp Fiction", 1994, 8_000_000, 107_930_000, 8.9),
        ...         ("The Shawshank Redemption", 1994, 25_000_000, 28_341_469, 9.3),
        ...     ],
        ...     schema=["title", "release_year", "budget", "gross", "imdb_score"],
        ...     orient="row",
        ... )
        >>> ctx = pl.SQLContext(films=df)

        Execute a SQL query against the registered frame data:

        >>> ctx.execute(
        ...     '''
        ...     SELECT title, release_year, imdb_score
        ...     FROM films
        ...     WHERE release_year > 1990
        ...     ORDER BY imdb_score DESC
        ...     ''',
        ...     eager=True,
        ... )
        shape: (4, 3)
        ┌──────────────────────────┬──────────────┬────────────┐
        │ title                    ┆ release_year ┆ imdb_score │
        │ ---                      ┆ ---          ┆ ---        │
        │ str                      ┆ i64          ┆ f64        │
        ╞══════════════════════════╪══════════════╪════════════╡
        │ The Shawshank Redemption ┆ 1994         ┆ 9.3        │
        │ The Dark Knight          ┆ 2008         ┆ 9.0        │
        │ Schindler's List         ┆ 1993         ┆ 8.9        │
        │ Pulp Fiction             ┆ 1994         ┆ 8.9        │
        └──────────────────────────┴──────────────┴────────────┘

        Execute a GROUP BY query:

        >>> ctx.execute(
        ...     '''
        ...     SELECT
        ...         MAX(release_year / 10) * 10 AS decade,
        ...         SUM(gross) AS total_gross,
        ...         COUNT(title) AS n_films,
        ...     FROM films
        ...     GROUP BY (release_year / 10) -- decade
        ...     ORDER BY total_gross DESC
        ...     ''',
        ...     eager=True,
        ... )
        shape: (3, 3)
        ┌────────┬─────────────┬─────────┐
        │ decade ┆ total_gross ┆ n_films │
        │ ---    ┆ ---         ┆ ---     │
        │ i64    ┆ i64         ┆ u32     │
        ╞════════╪═════════════╪═════════╡
        │ 2000   ┆ 533316061   ┆ 1       │
        │ 1990   ┆ 232338648   ┆ 3       │
        │ 1970   ┆ 134821952   ┆ 1       │
        └────────┴─────────────┴─────────┘
        """
        res = wrap_ldf(self._ctxt.execute(query))
        return res.collect() if (eager or self._eager_execution) else res

    def register(self, name: str, frame: CompatibleFrameType | None) -> Self:
        """
        Register a single frame as a table, using the given name.

        Parameters
        ----------
        name
            Name of the table.
        frame
            eager/lazy frame to associate with this table name.

        See Also
        --------
        register_globals
        register_many
        unregister

        Examples
        --------
        >>> df = pl.DataFrame({"hello": ["world"]})
        >>> ctx = pl.SQLContext()
        >>> ctx.register("frame_data", df).execute("SELECT * FROM frame_data").collect()
        shape: (1, 1)
        ┌───────┐
        │ hello │
        │ ---   │
        │ str   │
        ╞═══════╡
        │ world │
        └───────┘
        """
        frame = LazyFrame() if frame is None else _ensure_lazyframe(frame)
        self._ctxt.register(name, frame._ldf)
        return self

    def register_globals(
        self, n: int | None = None, *, all_compatible: bool = True
    ) -> Self:
        """
        Register all frames (lazy or eager) found in the current globals scope.

        Automatically maps variable names to table names.

        See Also
        --------
        register
        register_many
        unregister

        Parameters
        ----------
        n
            Register only the most recent "n" frames.
        all_compatible
            Control whether we *also* register pandas DataFrame, Series, and
            pyarrow Table and RecordBatch objects. If False, only Polars
            classes are registered with the SQL engine.

        Examples
        --------
        >>> df1 = pl.DataFrame({"a": [1, 2, 3], "b": ["x", None, "z"]})
        >>> df2 = pl.DataFrame({"a": [2, 3, 4], "c": ["t", "w", "v"]})

        Register frames directly from variables found in the current globals scope:

        >>> ctx = pl.SQLContext(register_globals=True)
        >>> ctx.tables()
        ['df1', 'df2']

        Query using the register variable/frame names

        >>> ctx.execute(
        ...     "SELECT a, b, c FROM df1 LEFT JOIN df2 USING (a) ORDER BY a DESC"
        ... ).collect()
        shape: (3, 3)
        ┌─────┬──────┬──────┐
        │ a   ┆ b    ┆ c    │
        │ --- ┆ ---  ┆ ---  │
        │ i64 ┆ str  ┆ str  │
        ╞═════╪══════╪══════╡
        │ 3   ┆ z    ┆ w    │
        │ 2   ┆ null ┆ t    │
        │ 1   ┆ x    ┆ null │
        └─────┴──────┴──────┘
        """
        frames = _get_frame_locals(all_compatible=all_compatible, n_objects=n)
        return self.register_many(frames=frames)

    def register_many(
        self,
        frames: Mapping[str, CompatibleFrameType | None] | None = None,
        **named_frames: CompatibleFrameType | None,
    ) -> Self:
        """
        Register multiple eager/lazy frames as tables, using the associated names.

        Parameters
        ----------
        frames
            A `{name:frame, ...}` mapping.
        **named_frames
            Named eager/lazy frames, provided as kwargs.

        See Also
        --------
        register
        register_globals
        unregister

        Examples
        --------
        >>> lf1 = pl.LazyFrame({"a": [1, 2, 3], "b": ["m", "n", "o"]})
        >>> lf2 = pl.LazyFrame({"a": [2, 3, 4], "c": ["p", "q", "r"]})
        >>> lf3 = pl.LazyFrame({"a": [3, 4, 5], "b": ["s", "t", "u"]})
        >>> lf4 = pl.LazyFrame({"a": [4, 5, 6], "c": ["v", "w", "x"]})

        Register multiple frames at once, either by passing in as a dict...

        >>> ctx = pl.SQLContext().register_many({"tbl1": lf1, "tbl2": lf2})
        >>> ctx.tables()
        ['tbl1', 'tbl2']

        ...or using keyword args:

        >>> ctx.register_many(tbl3=lf3, tbl4=lf4).tables()
        ['tbl1', 'tbl2', 'tbl3', 'tbl4']
        """
        frames = dict(frames or {})
        frames.update(named_frames)
        for name, frame in frames.items():
            self.register(name, frame)
        return self

    def unregister(self, names: str | Collection[str]) -> Self:
        """
        Unregister one or more eager/lazy frames by name.

        Parameters
        ----------
        names
            Names of the tables to unregister.

        Notes
        -----
        You can also control table registration lifetime by using `SQLContext` as a
        context manager; this can often be more useful when such control is wanted:

        >>> df0 = pl.DataFrame({"colx": [0, 1, 2]})
        >>> df1 = pl.DataFrame({"colx": [1, 2, 3]})
        >>> df2 = pl.DataFrame({"colx": [2, 3, 4]})

        Frames registered in-scope are automatically unregistered on scope-exit. Note
        that frames registered on construction will persist through subsequent scopes.

        >>> # register one frame at construction time, and the other two in-scope
        >>> with pl.SQLContext(tbl0=df0) as ctx:
        ...     ctx.register_many(tbl1=df1, tbl2=df2).tables()
        ['tbl0', 'tbl1', 'tbl2']

        After scope exit, none of the tables registered in-scope remain:

        >>> ctx.tables()
        ['tbl0']

        See Also
        --------
        register
        register_globals
        register_many

        Examples
        --------
        >>> df0 = pl.DataFrame({"ints": [9, 8, 7, 6, 5]})
        >>> lf1 = pl.LazyFrame({"text": ["a", "b", "c"]})
        >>> lf2 = pl.LazyFrame({"misc": ["testing1234"]})

        Register with a SQLContext object:

        >>> ctx = pl.SQLContext(test1=df0, test2=lf1, test3=lf2)
        >>> ctx.tables()
        ['test1', 'test2', 'test3']

        Unregister one or more of the tables:

        >>> ctx.unregister(["test1", "test3"]).tables()
        ['test2']
        >>> ctx.unregister("test2").tables()
        []
        """
        if isinstance(names, str):
            names = [names]
        for nm in names:
            self._ctxt.unregister(nm)
        return self

    def tables(self) -> list[str]:
        """
        Return a list of the registered table names.

        Notes
        -----
        The :meth:`tables` method will return the same values as the
        "SHOW TABLES" SQL statement, but as a list instead of a frame.

        Executing as SQL:

        >>> frame_data = pl.DataFrame({"hello": ["world"]})
        >>> ctx = pl.SQLContext(hello_world=frame_data)
        >>> ctx.execute("SHOW TABLES", eager=True)
        shape: (1, 1)
        ┌─────────────┐
        │ name        │
        │ ---         │
        │ str         │
        ╞═════════════╡
        │ hello_world │
        └─────────────┘

        Calling the method:

        >>> ctx.tables()
        ['hello_world']

        Examples
        --------
        >>> df1 = pl.DataFrame({"hello": ["world"]})
        >>> df2 = pl.DataFrame({"foo": ["bar", "baz"]})
        >>> ctx = pl.SQLContext(hello_data=df1, foo_bar=df2)
        >>> ctx.tables()
        ['foo_bar', 'hello_data']
        """
        return sorted(self._ctxt.get_tables())