-
Notifications
You must be signed in to change notification settings - Fork 155
feat: expose variety of features from DF54 update #1554
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
a2e086d
95eee71
96c3a14
51fc2bc
4131899
5387f30
3c052ef
c2e0881
430d4ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,13 +86,16 @@ | |
|
|
||
| import pandas as pd | ||
| import polars as pl # type: ignore[import] | ||
| from _typeshed import CapsuleType as _PyCapsule | ||
|
|
||
| from datafusion.catalog import CatalogProvider, Table | ||
| from datafusion.common import DFSchema | ||
| from datafusion.expr import Expr, SortKey | ||
| from datafusion.plan import ExecutionPlan, LogicalPlan | ||
| from datafusion.user_defined import ( | ||
| AggregateUDF, | ||
| LogicalExtensionCodecExportable, | ||
| PhysicalExtensionCodecExportable, | ||
| ScalarUDF, | ||
| TableFunction, | ||
| WindowUDF, | ||
|
|
@@ -959,6 +962,45 @@ def register_record_batches( | |
| """ | ||
| self.ctx.register_record_batches(name, partitions) | ||
|
|
||
| def read_batch(self, batch: pa.RecordBatch) -> DataFrame: | ||
| """Return a :py:class:`~datafusion.DataFrame` reading a single batch. | ||
|
|
||
| Convenience wrapper around :py:meth:`read_batches` for the single-batch | ||
| case. Unlike :py:meth:`register_batch`, this does not register the | ||
| batch as a named table; it returns an anonymous | ||
| :py:class:`~datafusion.DataFrame` directly. | ||
|
|
||
| Args: | ||
| batch: Record batch to wrap as a DataFrame. | ||
|
|
||
| Examples: | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3]}) | ||
| >>> ctx.read_batch(batch).to_pydict() | ||
| {'a': [1, 2, 3]} | ||
| """ | ||
| return self.read_batches([batch]) | ||
|
|
||
| def read_batches(self, batches: list[pa.RecordBatch]) -> DataFrame: | ||
| """Return a :py:class:`~datafusion.DataFrame` reading the given batches. | ||
|
|
||
| All batches must share the same schema. Unlike | ||
| :py:meth:`register_record_batches`, this does not register the batches | ||
| as a named table; it returns an anonymous | ||
| :py:class:`~datafusion.DataFrame` directly. | ||
|
|
||
| Args: | ||
| batches: Record batches to wrap as a DataFrame. | ||
|
|
||
| Examples: | ||
| >>> ctx = dfn.SessionContext() | ||
| >>> b1 = pa.RecordBatch.from_pydict({"a": [1, 2]}) | ||
| >>> b2 = pa.RecordBatch.from_pydict({"a": [3, 4]}) | ||
| >>> ctx.read_batches([b1, b2]).to_pydict() | ||
| {'a': [1, 2, 3, 4]} | ||
| """ | ||
| return DataFrame(self.ctx.read_batches(batches)) | ||
|
|
||
| def register_parquet( | ||
| self, | ||
| name: str, | ||
|
|
@@ -1268,6 +1310,65 @@ def deregister_udwf(self, name: str) -> None: | |
| """ | ||
| self.ctx.deregister_udwf(name) | ||
|
|
||
| def udf(self, name: str) -> ScalarUDF: | ||
| """Look up a registered scalar UDF by name. | ||
|
|
||
| Args: | ||
| name: Name of the registered scalar UDF. | ||
|
|
||
| Raises: | ||
| Exception: If no scalar UDF is registered under ``name``. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't recall if this is the convention across the code base but on quick look I'd expect to just return ScalarUDF | None |
||
| """ | ||
| from datafusion.user_defined import ScalarUDF as _ScalarUDF # noqa: PLC0415 | ||
|
|
||
| wrapper = _ScalarUDF.__new__(_ScalarUDF) | ||
| wrapper._udf = self.ctx.udf(name) | ||
| return wrapper | ||
|
|
||
| def udaf(self, name: str) -> AggregateUDF: | ||
| """Look up a registered aggregate UDF by name. | ||
|
|
||
| Args: | ||
| name: Name of the registered aggregate UDF. | ||
|
|
||
| Raises: | ||
| Exception: If no aggregate UDF is registered under ``name``. | ||
| """ | ||
| from datafusion.user_defined import ( # noqa: PLC0415 | ||
| AggregateUDF as _AggregateUDF, | ||
| ) | ||
|
|
||
| wrapper = _AggregateUDF.__new__(_AggregateUDF) | ||
| wrapper._udaf = self.ctx.udaf(name) | ||
| return wrapper | ||
|
|
||
| def udwf(self, name: str) -> WindowUDF: | ||
| """Look up a registered window UDF by name. | ||
|
|
||
| Args: | ||
| name: Name of the registered window UDF. | ||
|
|
||
| Raises: | ||
| Exception: If no window UDF is registered under ``name``. | ||
| """ | ||
| from datafusion.user_defined import WindowUDF as _WindowUDF # noqa: PLC0415 | ||
|
|
||
| wrapper = _WindowUDF.__new__(_WindowUDF) | ||
| wrapper._udwf = self.ctx.udwf(name) | ||
| return wrapper | ||
|
|
||
| def udfs(self) -> list[str]: | ||
| """Return the sorted names of all registered scalar UDFs.""" | ||
| return self.ctx.udfs() | ||
|
|
||
| def udafs(self) -> list[str]: | ||
| """Return the sorted names of all registered aggregate UDFs.""" | ||
| return self.ctx.udafs() | ||
|
|
||
| def udwfs(self) -> list[str]: | ||
| """Return the sorted names of all registered window UDFs.""" | ||
| return self.ctx.udwfs() | ||
|
|
||
| def catalog(self, name: str = "datafusion") -> Catalog: | ||
| """Retrieve a catalog by name.""" | ||
| return Catalog(self.ctx.catalog(name)) | ||
|
|
@@ -1744,11 +1845,15 @@ def __datafusion_logical_extension_codec__(self) -> Any: | |
| """Access the PyCapsule FFI_LogicalExtensionCodec.""" | ||
| return self.ctx.__datafusion_logical_extension_codec__() | ||
|
|
||
| def with_logical_extension_codec(self, codec: Any) -> SessionContext: | ||
| def with_logical_extension_codec( | ||
| self, codec: LogicalExtensionCodecExportable | _PyCapsule | ||
| ) -> SessionContext: | ||
| """Create a new session context with specified codec. | ||
|
|
||
| This only supports codecs that have been implemented using the | ||
| FFI interface. | ||
| FFI interface. ``codec`` must either be a raw ``FFI_LogicalExtensionCodec`` | ||
| ``PyCapsule`` or an object exposing | ||
| ``__datafusion_logical_extension_codec__``. | ||
|
Comment on lines
+1756
to
+1758
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this addition? Isn't this redundant with the typing? |
||
| """ | ||
| new_internal = self.ctx.with_logical_extension_codec(codec) | ||
| new = SessionContext.__new__(SessionContext) | ||
|
|
@@ -1759,11 +1864,15 @@ def __datafusion_physical_extension_codec__(self) -> Any: | |
| """Access the PyCapsule FFI_PhysicalExtensionCodec.""" | ||
| return self.ctx.__datafusion_physical_extension_codec__() | ||
|
|
||
| def with_physical_extension_codec(self, codec: Any) -> SessionContext: | ||
| def with_physical_extension_codec( | ||
| self, codec: PhysicalExtensionCodecExportable | _PyCapsule | ||
| ) -> SessionContext: | ||
| """Create a new session context with the specified physical codec. | ||
|
|
||
| This only supports codecs that have been implemented using the | ||
| FFI interface. | ||
| FFI interface. ``codec`` must either be a raw | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto on shadowing type hint |
||
| ``FFI_PhysicalExtensionCodec`` ``PyCapsule`` or an object exposing | ||
| ``__datafusion_physical_extension_codec__``. | ||
| """ | ||
| new_internal = self.ctx.with_physical_extension_codec(codec) | ||
| new = SessionContext.__new__(SessionContext) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2727,14 +2727,24 @@ def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr: | |
| return Expr(f.arrow_metadata(expr.expr, key.expr)) | ||
|
|
||
|
|
||
| def get_field(expr: Expr, name: Expr | str) -> Expr: | ||
| """Extracts a field from a struct or map by name. | ||
| def get_field(expr: Expr, *names: Expr | str) -> Expr: | ||
|
timsaucer marked this conversation as resolved.
|
||
| """Extracts a (possibly nested) field from a struct or map by name. | ||
|
|
||
| When the field name is a static string, the bracket operator | ||
| ``expr["field"]`` is a convenient shorthand. Use ``get_field`` | ||
| when the field name is a dynamic expression. | ||
| Pass one name for a single-level lookup, or several names to walk a path | ||
| of nested struct/map fields in a single ``get_field`` call. For a single | ||
| static-string name, ``expr["field"]`` is a convenient shorthand; use | ||
| ``get_field`` when the field name is a dynamic | ||
| :py:class:`~datafusion.expr.Expr` or when traversing multiple levels at | ||
| once. | ||
|
|
||
| Args: | ||
| expr: The struct or map expression to read from. | ||
| *names: One or more field names (``str``) or expressions | ||
| (:py:class:`~datafusion.expr.Expr`). | ||
|
|
||
| Examples: | ||
| Single-level lookup: | ||
|
|
||
| >>> ctx = dfn.SessionContext() | ||
| >>> df = ctx.from_pydict({"a": [1], "b": [2]}) | ||
| >>> df = df.with_column( | ||
|
|
@@ -2756,10 +2766,26 @@ def get_field(expr: Expr, name: Expr | str) -> Expr: | |
| ... ) | ||
| >>> result.collect_column("x_val")[0].as_py() | ||
| 1 | ||
|
|
||
| Multi-level lookup: | ||
|
|
||
| >>> df = df.with_column( | ||
| ... "outer", | ||
| ... dfn.functions.named_struct([("inner", dfn.col("s"))]), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NIT: Not required here but the doctest namespace already imports functions a F which would make this addition less verbose. |
||
| ... ) | ||
| >>> result = df.select( | ||
| ... dfn.functions.get_field( | ||
| ... dfn.col("outer"), "inner", "x" | ||
| ... ).alias("x_val") | ||
| ... ) | ||
| >>> result.collect_column("x_val")[0].as_py() | ||
| 1 | ||
| """ | ||
| if isinstance(name, str): | ||
| name = Expr.string_literal(name) | ||
| return Expr(f.get_field(expr.expr, name.expr)) | ||
| if not names: | ||
| msg = "get_field requires at least one field name" | ||
| raise ValueError(msg) | ||
| resolved = [Expr.string_literal(n) if isinstance(n, str) else n for n in names] | ||
| return Expr(f.get_field(expr.expr, [n.expr for n in resolved])) | ||
|
|
||
|
|
||
| def union_extract(union_expr: Expr, field_name: Expr | str) -> Expr: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would consider it more pythonic for read_batches to accept RecordBatch | Iterable[RecordBatches]