From b5261fe4fcb6c594cddef935dff712f7a290f4ec Mon Sep 17 00:00:00 2001
From: Mark Molinaro
Date: Sat, 28 Feb 2026 00:48:27 +0000
Subject: [PATCH] [DOCS] add section for running `pyspark` with `uv` inline
 dependencies

---
 dev/requirements.txt                              |  2 +-
 .../docs/source/tutorial/python_packaging.rst     | 58 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/dev/requirements.txt b/dev/requirements.txt
index a219a017cf92f..05dba33758863 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -29,7 +29,7 @@ mypy==1.19.1
 pytest-mypy-plugins==3.2.0
 # See SPARK-38680.
 pandas-stubs>=2.2.0
-scipy-stubs;
+scipy-stubs
 types-PyYAML
 
 # Documentation (SQL)
diff --git a/python/docs/source/tutorial/python_packaging.rst b/python/docs/source/tutorial/python_packaging.rst
index 3adfb5b4b371b..4803be9adb95d 100644
--- a/python/docs/source/tutorial/python_packaging.rst
+++ b/python/docs/source/tutorial/python_packaging.rst
@@ -57,6 +57,7 @@ There are multiple ways to manage Python dependencies in the cluster:
 - Using Conda
 - Using Virtualenv
 - Using PEX
+- Using uv
 
 
 Using PySpark Native Features
@@ -251,3 +252,60 @@ An end-to-end Docker example for deploying a standalone PySpark with ``SparkSess
 can be found `here <https://github.com/criteo/cluster-pack/blob/master/examples/spark-with-S3/README.md>`_
 - it uses cluster-pack, a library on top of PEX that automatizes the intermediate
 step of having to create & upload the PEX manually.
+
+
+Using uv run
+------------
+
+`uv <https://docs.astral.sh/uv/>`_ can run self-contained Python scripts with inline dependency
+declarations and no manual environment management. Dependencies are resolved and installed
+automatically on first run.
+
+To use it, create a wrapper script called ``uv_run``:
+
+.. code-block:: bash
+
+    exec uv run [--python <version>] "$@"
+
+and make it executable with ``chmod +x uv_run``.
+
+You may pass ``--python <version>`` to select a specific Python interpreter
+(see `using different Python versions <https://docs.astral.sh/uv/guides/scripts/#using-different-python-versions>`_).
+
+Set ``PYSPARK_PYTHON`` to the wrapper script:
+
+.. code-block:: bash
+
+    export PYSPARK_PYTHON=./uv_run
+    pyspark app.py
+
+Then, within your Python script, you may declare dependencies using `PEP 723 inline script metadata <https://peps.python.org/pep-0723/>`_:
+
+.. code-block:: python
+
+    #!/usr/bin/env -S uv run --script
+    #
+    # /// script
+    # dependencies = [
+    #     "pandas==2.2.3",
+    # ]
+    # ///
+
+    import pandas as pd
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import pandas_udf
+
+    spark = SparkSession.builder.master("local[*]").appName("UvRunExample").getOrCreate()
+
+    df = spark.createDataFrame(
+        [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
+        ("id", "v"),
+    )
+
+    @pandas_udf("double")
+    def mean_udf(v: pd.Series) -> float:
+        return v.mean()
+
+    print(df.groupby("id").agg(mean_udf(df["v"])).collect())
+
+    spark.stop()