From 4704062d0d0bd24fdc66e1d85a04a495717d2d47 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:27:17 +0200 Subject: [PATCH 1/8] Support for pandas dataframes. --- README.md | 37 ++++++++++++++++++++++++++++++------- pyproject.toml | 5 ++++- requirements.txt | 3 ++- src/jsoniq/sequence.py | 5 ++++- src/jsoniq/session.py | 27 +++++++++++++++++++++------ 5 files changed, 61 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 403c76b..f8b64bb 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ When passing Python values to JSONiq or getting them from a JSONiq queries, the | Python | JSONiq | |-------|-------| +|tuple|sequence of items| |dict|object| |list|array| |str|string| @@ -73,6 +74,7 @@ You can directly copy paste the code below to a Python file and execute it with ``` from jsoniq import RumbleSession +import pandas as pd # The syntax to start a session is similar to that of Spark. # A RumbleSession is a SparkSession that additionally knows about RumbleDB. @@ -155,16 +157,16 @@ print(seq.json()); ###### Binding JSONiq variables to Python values ########### ############################################################ -# It is possible to bind a JSONiq variable to a list of native Python values +# It is possible to bind a JSONiq variable to a tuple of native Python values # and then use it in a query. # JSONiq, variables are bound to sequences of items, just like the results of JSONiq # queries are sequence of items. -# A Python list will be seamlessly converted to a sequence of items by the library. +# A Python tuple will be seamlessly converted to a sequence of items by the library. # Currently we only support strs, ints, floats, booleans, None, lists, and dicts. # But if you need more (like date, bytes, etc) we will add them without any problem. # JSONiq has a rich type system. 
-rumble.bind('$c', [1,2,3,4, 5, 6]) +rumble.bind('$c', (1,2,3,4, 5, 6)) print(rumble.jsoniq(""" for $v in $c let $parity := $v mod 2 @@ -176,7 +178,7 @@ return { switch($parity) } """).json()) -rumble.bind('$c', [[1,2,3],[4,5,6]]) +rumble.bind('$c', ([1,2,3],[4,5,6])) print(rumble.jsoniq(""" for $i in $c return [ @@ -185,18 +187,34 @@ return [ ] """).json()) -rumble.bind('$c', [{"foo":[1,2,3]},{"foo":[4,{"bar":[1,False, None]},6]}]) +rumble.bind('$c', ({"foo":[1,2,3]},{"foo":[4,{"bar":[1,False, None]},6]})) print(rumble.jsoniq('{ "results" : $c.foo[[2]] }').json()) -# It is possible to bind only one value. The it must be provided as a singleton list. +# It is possible to bind only one value. Then it must be provided as a singleton tuple. # This is because in JSONiq, an item is the same a sequence of one item. -rumble.bind('$c', [42]) +rumble.bind('$c', (42,)) print(rumble.jsoniq('for $i in 1 to $c return $i*$i').json()) # For convenience and code readability, you can also use bindOne(). rumble.bindOne('$c', 42) print(rumble.jsoniq('for $i in 1 to $c return $i*$i').json()) +########################################################## +##### Binding JSONiq variables to pandas DataFrames ###### +##### Getting the output as a Pandas DataFrame ###### +########################################################## + +# Creating a dummy pandas dataframe +data = {'Name': ['Alice', 'Bob', 'Charlie'], + 'Age': [30,25,35]}; +pdf = pd.DataFrame(data); + +# Binding a pandas dataframe +rumble.bind('$a',pdf); +seq = rumble.jsoniq('$a.Name') +# Getting the output as a pandas dataframe +print(seq.pdf()) + ################################################ ##### Using Pyspark DataFrames with JSONiq ##### @@ -324,6 +342,11 @@ Even more queries can be found [here](https://colab.research.google.com/github/R # Last updates +## Version 0.1.0 alpha 13 +- Allow to bind JSONiq variables to pandas dataframes +- Allow to retrieve the output of a JSONiq query as a pandas dataframes (if the output is 
available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame") +- Clean up the mapping to strictly map tuples to sequence of items, and lists ot array items. This will avoid confusion between arrays and sequences. + ## Version 0.1.0 alpha 12 - Allow to bind JSONiq variables to Python values (mapping Python lists to sequences of items). This makes it possible to manipulate Python values directly with JSONiq and even without any knowledge of Spark at all. - renamed bindDataFrameAsVariable() to bind(), which can be used both with DataFrames and Python lists. diff --git a/pyproject.toml b/pyproject.toml index b0dc675..dfb628a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ version = "0.1.0a12" description = "Python edition of RumbleDB, a JSONiq engine" requires-python = ">=3.11" dependencies = [ - "pyspark==4.0" + "pyspark==4.0", + "pandas==2.3" ] authors = [ {name = "Ghislain Fourny", email = "ghislain.fourny@inf.ethz.ch"}, @@ -23,6 +24,8 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Typing :: Typed", + "License :: OSI Approved :: Apache Software License" ] [tool.setuptools.packages.find] diff --git a/requirements.txt b/requirements.txt index ea9b258..0368767 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -pyspark==4.0.0 +pyspark==4.0 +pandas==2.3 diff --git a/src/jsoniq/sequence.py b/src/jsoniq/sequence.py index f7f2b5d..b0bdc5e 100644 --- a/src/jsoniq/sequence.py +++ b/src/jsoniq/sequence.py @@ -10,7 +10,7 @@ def __init__(self, sequence, sparksession): self._sparksession = sparksession def json(self): - return [json.loads(l.serializeAsJSON()) for l in self._jsequence.items()] + return tuple([json.loads(l.serializeAsJSON()) for l in self._jsequence.items()]) def rdd(self): rdd = self._jsequence.getAsPickledStringRDD(); @@ -20,6 +20,9 @@ def rdd(self): def df(self): return 
DataFrame(self._jsequence.getAsDataFrame(), self._sparksession) + def pdf(self): + return self.df().toPandas() + def nextJSON(self): return self._jsequence.next().serializeAsJSON() diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py index 66ddd95..28f6085 100644 --- a/src/jsoniq/session.py +++ b/src/jsoniq/session.py @@ -4,6 +4,7 @@ import platform import os import re +import pandas as pd import importlib.resources as pkg_resources with pkg_resources.path("jsoniq.jars", "rumbledb-1.24.0.jar") as jar_path: @@ -84,6 +85,8 @@ def __getattr__(self, name): _builder = Builder() def convert(self, value): + if isinstance(value, tuple): + return [ self.convert(v) for v in value] if isinstance(value, bool): return self._sparksession._jvm.org.rumbledb.items.ItemFactory.getInstance().createBooleanItem(value) elif isinstance(value, str): @@ -114,18 +117,30 @@ def bind(self, name: str, valueToBind): if not name.startswith("$"): raise ValueError("Variable name must start with a dollar symbol ('$').") name = name[1:] - if isinstance(valueToBind, list): - items = [ self.convert(value) for value in valueToBind] - conf.setExternalVariableValue(name, items) - return self - if(hasattr(valueToBind, "_get_object_id")): + if isinstance(valueToBind, SequenceOfItems): + outputs = valueToBind.availableOutputs() + if isinstance(outputs, list) and "DataFrame" in outputs: + conf.setExternalVariableValue(name, valueToBind.df()); + # TODO support binding a variable to an RDD + #elif isinstance(outputs, list) and "RDD" in outputs: + # conf.setExternalVariableValue(name, valueToBind.getAsRDD()); + else: + conf.setExternalVariableValue(name, valueToBind.items()); + elif isinstance(valueToBind, pd.DataFrame): + pysparkdf = self._sparksession.createDataFrame(valueToBind) + conf.setExternalVariableValue(name, pysparkdf._jdf); + elif isinstance(valueToBind, tuple): + conf.setExternalVariableValue(name, self.convert(valueToBind)) + elif isinstance(valueToBind, list): + raise ValueError("To avoid 
confusion, a sequence of items must be provided as a Python tuple, not as a Python list. Lists are mapped to single array items, while tuples are mapped to sequences of items. If you want to bind the variable to one array item, then you need to wrap the provided list inside a singleton tuple and try again, or you can also call bindOne() instead.") + elif(hasattr(valueToBind, "_get_object_id")): conf.setExternalVariableValue(name, valueToBind); else: conf.setExternalVariableValue(name, valueToBind._jdf); return self; def bindOne(self, name: str, value): - return self.bind(name, [value]) + return self.bind(name, (value,)) def bindDataFrameAsVariable(self, name: str, df): conf = self._jrumblesession.getConfiguration(); From ba7c917bd52fdd4bc98b885b4a658a3818c28219 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:28:23 +0200 Subject: [PATCH 2/8] Extend release notes. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f8b64bb..8da1fa5 100644 --- a/README.md +++ b/README.md @@ -346,6 +346,7 @@ Even more queries can be found [here](https://colab.research.google.com/github/R - Allow to bind JSONiq variables to pandas dataframes - Allow to retrieve the output of a JSONiq query as a pandas dataframes (if the output is available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame") - Clean up the mapping to strictly map tuples to sequence of items, and lists ot array items. This will avoid confusion between arrays and sequences. +- As a consequence, json() now returns a tuple, not a list. ## Version 0.1.0 alpha 12 - Allow to bind JSONiq variables to Python values (mapping Python lists to sequences of items). This makes it possible to manipulate Python values directly with JSONiq and even without any knowledge of Spark at all. 
From cc3ac11e46af1c5d107f13b06333c1985e31ad1a Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:29:56 +0200 Subject: [PATCH 3/8] Extend release notes. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8da1fa5..9a55629 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,7 @@ Even more queries can be found [here](https://colab.research.google.com/github/R - Allow to retrieve the output of a JSONiq query as a pandas dataframes (if the output is available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame") - Clean up the mapping to strictly map tuples to sequence of items, and lists ot array items. This will avoid confusion between arrays and sequences. - As a consequence, json() now returns a tuple, not a list. +- Calling bind() with a single list will return an informative error. Use bind() with a tuple instead, or call bindOne() to interpret the list as a sequence of one array item. ## Version 0.1.0 alpha 12 - Allow to bind JSONiq variables to Python values (mapping Python lists to sequences of items). This makes it possible to manipulate Python values directly with JSONiq and even without any knowledge of Spark at all. From cbbae51c483eb05b5f89eb289e09f53835af5fec Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:31:50 +0200 Subject: [PATCH 4/8] Fix typo. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9a55629..a096c09 100644 --- a/README.md +++ b/README.md @@ -344,7 +344,7 @@ Even more queries can be found [here](https://colab.research.google.com/github/R ## Version 0.1.0 alpha 13 - Allow to bind JSONiq variables to pandas dataframes -- Allow to retrieve the output of a JSONiq query as a pandas dataframes (if the output is available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame") +- Allow to retrieve the output of a JSONiq query as a pandas dataframe (if the output is available as a dataframe, i.e., availableOutputs() returns a list that contains "DataFrame") - Clean up the mapping to strictly map tuples to sequence of items, and lists ot array items. This will avoid confusion between arrays and sequences. - As a consequence, json() now returns a tuple, not a list. - Calling bind() with a single list will return an informative error. Use bind() with a tuple instead, or call bindOne() to interpret the list as a sequence of one array item. From 23acc3ec2bf188ec313c4c71b9005f14cbb5467d Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:33:51 +0200 Subject: [PATCH 5/8] Make error message more verbose. --- src/jsoniq/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py index 28f6085..b1e8568 100644 --- a/src/jsoniq/session.py +++ b/src/jsoniq/session.py @@ -132,7 +132,7 @@ def bind(self, name: str, valueToBind): elif isinstance(valueToBind, tuple): conf.setExternalVariableValue(name, self.convert(valueToBind)) elif isinstance(valueToBind, list): - raise ValueError("To avoid confusion, a sequence of items must be provided as a Python tuple, not as a Python list. Lists are mapped to single array items, while tuples are mapped to sequences of items. 
If you want to bind the variable to one array item, then you need to wrap the provided list inside a singleton tuple and try again, or you can also call bindOne() instead.") + raise ValueError("To avoid confusion, a sequence of items must be provided as a Python tuple, not as a Python list. Lists are mapped to single array items, while tuples are mapped to sequences of items. If you want to interpret the list as a sequence of items (one item for each list member), then you need to change this list to a tuple by wrapping it into a tuple() call. If you want to bind the variable to one array item, then you need to wrap the provided list inside a singleton tuple and try again, or you can also call bindOne() instead.") elif(hasattr(valueToBind, "_get_object_id")): conf.setExternalVariableValue(name, valueToBind); else: From e874a4a9887121b989bd9cdde518d8112d138d62 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:39:06 +0200 Subject: [PATCH 6/8] Improve the documentation. --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a096c09..e6ae857 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,10 @@ Any feedback or error reports are very welcome. ## Type mapping -When passing Python values to JSONiq or getting them from a JSONiq queries, the mapping is as follows: +Any expression in JSONiq returns a sequence of items. Any variable in JSONiq is bound to a sequence of items. +Items can be objects, arrays, or atomic values (strings, integers, booleans, nulls, dates, binary, durations, doubles, decimal numbers, etc). + +When passing Python values to JSONiq or getting them from a JSONiq queries, the mapping to and from Python is as follows: | Python | JSONiq | |-------|-------| From 67685e89fd7c17955a590484dd466c229d175bd8 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:41:38 +0200 Subject: [PATCH 7/8] Improve documentation. 
--- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index e6ae857..5fa35a6 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ Any feedback or error reports are very welcome. Any expression in JSONiq returns a sequence of items. Any variable in JSONiq is bound to a sequence of items. Items can be objects, arrays, or atomic values (strings, integers, booleans, nulls, dates, binary, durations, doubles, decimal numbers, etc). +A sequence of items can be a sequence of just one item, but it can also be empty, or it can be large enough to contain millions, billions or even trillions of items. Obviously, for sequences longer than a billion items, it is a better idea to use a cluster than a laptop. +A relational table (or more generally a data frame) corresponds to a sequence of object items sharing the same schema. However, sequences of items are more general than tables or data frames and support heterogeneity seamlessly. When passing Python values to JSONiq or getting them from a JSONiq queries, the mapping to and from Python is as follows: From 1f4adc97e498f978db3dddb0abb92d8e90bc9b67 Mon Sep 17 00:00:00 2001 From: Ghislain Fourny Date: Tue, 22 Jul 2025 10:44:28 +0200 Subject: [PATCH 8/8] Bump version. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dfb628a..857b96d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "jsoniq" -version = "0.1.0a12" +version = "0.2.0a1" description = "Python edition of RumbleDB, a JSONiq engine" requires-python = ">=3.11" dependencies = [