From 4f1e2f48ea35adae623a03cb00ebbaac383799a8 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 13:02:43 +0800 Subject: [PATCH 1/7] support VARIANT for pypaimon --- docs/content/pypaimon/python-api.md | 218 ++- .../java/org/apache/paimon/JavaPyE2ETest.java | 183 +++ paimon-python/dev/run_mixed_tests.sh | 79 +- .../pypaimon/common/options/core_options.py | 22 + .../pypaimon/data/generic_variant.py | 872 ++++++++++++ .../pypaimon/data/variant_shredding.py | 979 ++++++++++++++ .../read/reader/format_pyarrow_reader.py | 96 +- paimon-python/pypaimon/schema/data_types.py | 29 + .../tests/e2e/java_py_read_write_test.py | 193 +++ paimon-python/pypaimon/tests/variant_test.py | 1188 +++++++++++++++++ .../pypaimon/write/writer/data_blob_writer.py | 2 + .../pypaimon/write/writer/data_writer.py | 67 +- 12 files changed, 3894 insertions(+), 34 deletions(-) create mode 100644 paimon-python/pypaimon/data/generic_variant.py create mode 100644 paimon-python/pypaimon/data/variant_shredding.py create mode 100644 paimon-python/pypaimon/tests/variant_test.py diff --git a/docs/content/pypaimon/python-api.md b/docs/content/pypaimon/python-api.md index 197a018ef1f7..7a73b317dd8c 100644 --- a/docs/content/pypaimon/python-api.md +++ b/docs/content/pypaimon/python-api.md @@ -687,22 +687,208 @@ Row kind values: ## Data Types -| Python Native Type | PyArrow Type | Paimon Type | -|:--------------------|:-------------------------------------------------|:----------------------------------| -| `int` | `pyarrow.int8()` | `TINYINT` | -| `int` | `pyarrow.int16()` | `SMALLINT` | -| `int` | `pyarrow.int32()` | `INT` | -| `int` | `pyarrow.int64()` | `BIGINT` | -| `float` | `pyarrow.float32()` | `FLOAT` | -| `float` | `pyarrow.float64()` | `DOUBLE` | -| `bool` | `pyarrow.bool_()` | `BOOLEAN` | -| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` | -| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` | -| `bytes` | 
`pyarrow.binary(length)` | `BINARY(length)` | -| `decimal.Decimal` | `pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` | -| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)` | `TIMESTAMP(p)` | -| `datetime.date` | `pyarrow.date32()` | `DATE` | -| `datetime.time` | `pyarrow.time32(unit)` or `pyarrow.time64(unit)` | `TIME(p)` | +### Scalar Types + +| Python Native Type | PyArrow Type | Paimon Type | +|:--------------------|:---------------------------------------|:----------------------------------| +| `int` | `pyarrow.int8()` | `TINYINT` | +| `int` | `pyarrow.int16()` | `SMALLINT` | +| `int` | `pyarrow.int32()` | `INT` | +| `int` | `pyarrow.int64()` | `BIGINT` | +| `float` | `pyarrow.float32()` | `FLOAT` | +| `float` | `pyarrow.float64()` | `DOUBLE` | +| `bool` | `pyarrow.bool_()` | `BOOLEAN` | +| `str` | `pyarrow.string()` | `STRING`, `CHAR(n)`, `VARCHAR(n)` | +| `bytes` | `pyarrow.binary()` | `BYTES`, `VARBINARY(n)` | +| `bytes` | `pyarrow.binary(length)` | `BINARY(length)` | +| `bytes` | `pyarrow.large_binary()` | `BLOB` | +| `decimal.Decimal` | `pyarrow.decimal128(precision, scale)` | `DECIMAL(precision, scale)` | +| `datetime.datetime` | `pyarrow.timestamp(unit, tz=None)` | `TIMESTAMP(p)` — unit: `'s'` p=0, `'ms'` p=1–3, `'us'` p=4–6, `'ns'` p=7–9 | +| `datetime.datetime` | `pyarrow.timestamp(unit, tz='UTC')` | `TIMESTAMP_LTZ(p)` — same unit/p mapping as above | +| `datetime.date` | `pyarrow.date32()` | `DATE` | +| `datetime.time` | `pyarrow.time32('ms')` | `TIME(p)` | + +### Complex Types + +| Python Native Type | PyArrow Type | Paimon Type | +|:-------------------|:--------------------------------------|:-----------------------| +| `list` | `pyarrow.list_(element_type)` | `ARRAY` | +| `dict` | `pyarrow.map_(key_type, value_type)` | `MAP` | +| `dict` | `pyarrow.struct([field, ...])` | `ROW` | + +### VARIANT Type + +`VARIANT` stores semi-structured, schema-flexible data (JSON objects, arrays, and primitives) +in the [Parquet Variant 
binary encoding](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md). + +pypaimon exposes VARIANT columns as Arrow `struct` and +provides `GenericVariant` for encoding, decoding, and path extraction. + +Paimon supports two Parquet storage layouts for VARIANT: + +- **Plain VARIANT** — the standard two-field struct (`value` + `metadata`). Default for all writes. +- **Shredded VARIANT** — typed sub-columns are stored alongside overflow bytes, enabling column-skipping + inside the Parquet file. Controlled by the `variant.shreddingSchema` table option. + +{{< tabs "variant-read-write" >}} +{{< tab "Plain VARIANT" >}} + +**Read** + +A VARIANT column arrives as `struct` in every Arrow batch. +Use `GenericVariant.from_arrow_struct` to decode each row: + +```python +from pypaimon.data.generic_variant import GenericVariant + +read_builder = table.new_read_builder() +result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits()) + +for record in result.to_pylist(): + if (payload := record["payload"]) is not None: + gv = GenericVariant.from_arrow_struct(payload) + print(gv.to_python()) # decode to Python dict / list / scalar + print(gv.to_json()) # decode to JSON string +``` + +`from_arrow_struct` is a lightweight operation — it only wraps the two raw byte arrays without +parsing them. Actual variant binary decoding is deferred to `to_python()` / `to_json()`. 
+ +**Write** + +Build `GenericVariant` values and convert them to an Arrow column with `to_arrow_array`: + +```python +import pyarrow as pa +from pypaimon.data.generic_variant import GenericVariant + +gv1 = GenericVariant.from_json('{"city": "Beijing", "age": 30}') +gv2 = GenericVariant.from_python({'tags': [1, 2, 3], 'active': True}) +# None represents SQL NULL + +data = pa.table({ + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'payload': GenericVariant.to_arrow_array([gv1, gv2, None]), +}) + +write_builder = table.new_batch_write_builder() +table_write = write_builder.new_write() +table_commit = write_builder.new_commit() +table_write.write_arrow(data) +table_commit.commit(table_write.prepare_commit()) +table_write.close() +table_commit.close() +``` + +{{< /tab >}} +{{< tab "Shredded VARIANT" >}} + +In shredded mode the VARIANT column is physically split inside Parquet into a three-field group: + +``` +payload (GROUP) +├── metadata BYTE_ARRAY -- key dictionary (always present) +├── value BYTE_ARRAY OPTIONAL -- overflow bytes for un-shredded fields +└── typed_value (GROUP) OPTIONAL + ├── age (GROUP) + │ ├── value BYTE_ARRAY OPTIONAL + │ └── typed_value INT64 OPTIONAL + └── city (GROUP) + ├── value BYTE_ARRAY OPTIONAL + └── typed_value BYTE_ARRAY OPTIONAL +``` + +**Read — automatic reassembly** + +When pypaimon reads a Parquet file that contains shredded VARIANT columns (whether written by Paimon Java +or by pypaimon with shredding enabled), it **automatically detects and reassembles** them back to the +standard `struct` form before returning any batch. 
No code changes are needed on the +read side: + +```python +from pypaimon.data.generic_variant import GenericVariant + +# Works identically for both shredded and plain Parquet files +read_builder = table.new_read_builder() +result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits()) + +for record in result.to_pylist(): + if (payload := record["payload"]) is not None: + gv = GenericVariant.from_arrow_struct(payload) # same API as plain VARIANT + print(gv.to_python()) +``` + +Reassembly (reconstructing the variant binary from `typed_value` sub-columns and overflow bytes) +happens inside `FormatPyArrowReader.read_arrow_batch()` — that is, **at batch read time**, before +the Arrow data is returned to the caller. Note: When sub-field projection is active +(`with_variant_sub_fields`), reassembly is skipped entirely and only the requested typed +sub-columns are decoded. + +**Write — shredding mode** + +Set the `variant.shreddingSchema` table option to a JSON-encoded `ROW` type that describes which +sub-fields of which VARIANT columns to shred. The top-level fields map VARIANT column names to their +sub-field schemas: + +```python +import json + +shredding_schema = json.dumps({ + "type": "ROW", + "fields": [ + { + "id": 0, + "name": "payload", # VARIANT column name in the table + "type": { + "type": "ROW", + "fields": [ # sub-fields to extract as typed columns + {"id": 0, "name": "age", "type": "BIGINT"}, + {"id": 1, "name": "city", "type": "VARCHAR"}, + ] + } + } + ] +}) + +# Pass the option when creating the table +schema = Schema.from_pyarrow_schema( + pa_schema, + options={'variant.shreddingSchema': shredding_schema} +) +catalog.create_table('db.events', schema, ignore_if_exists=True) +``` + +Once the option is set, each `write_arrow` call transparently converts VARIANT columns to the shredded +Parquet layout. 
The read path — including Java Paimon and other engines — can then exploit the typed +sub-columns for column-skipping via sub-field projection. + +Fields not listed in `variant.shreddingSchema` are stored in the overflow `value` bytes and remain +fully accessible on the read path. + +Supported Paimon type strings for shredded sub-fields: `BOOLEAN`, `INT`, `BIGINT`, `FLOAT`, `DOUBLE`, +`VARCHAR`, `DECIMAL(p,s)`, and nested `ROW` types for recursive object shredding. + +{{< /tab >}} +{{< /tabs >}} + + +**`GenericVariant` API:** + +| Method | Description | +|:-------|:------------| +| `GenericVariant.from_json(json_str)` | Build from a JSON string | +| `GenericVariant.from_python(obj)` | Build from a Python object (`dict`, `list`, `int`, `str`, …) | +| `GenericVariant.from_arrow_struct({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row (read path) | +| `GenericVariant.to_arrow_array([gv1, gv2, None, ...])` | Convert a list of `GenericVariant` (or `None`) to a `pa.StructArray` for writing | +| `gv.to_python()` | Decode to native Python (`dict`, `list`, `int`, `str`, `None`, …) | +| `gv.to_json()` | Decode to a JSON string | +| `gv.value()` | Return raw value bytes | +| `gv.metadata()` | Return raw metadata bytes | + +**Limitations:** + +- `VARIANT` is only supported with Parquet file format. Writing to ORC or Avro raises `NotImplementedError`. +- `VARIANT` cannot be used as a primary key or partition key. 
## Predicate diff --git a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java index c09bf3466384..dbe000401f59 100644 --- a/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java +++ b/paimon-core/src/test/java/org/apache/paimon/JavaPyE2ETest.java @@ -28,6 +28,7 @@ import org.apache.paimon.data.DataFormatTestUtil; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.variant.GenericVariant; import org.apache.paimon.disk.IOManager; import org.apache.paimon.fs.FileIOFinder; import org.apache.paimon.fs.Path; @@ -941,6 +942,188 @@ protected GenericRow createRow3ColsWithKind(RowKind rowKind, Object... values) { return GenericRow.ofKind(rowKind, values[0], values[1], values[2]); } + /** Java writes a VARIANT-column table for Python to read (Java→Python E2E). */ + @Test + @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") + public void testJavaWriteVariantTable() throws Exception { + Identifier identifier = identifier("variant_test"); + catalog.dropTable(identifier, true); + Schema schema = + Schema.newBuilder() + .column("id", DataTypes.INT()) + .column("name", DataTypes.STRING()) + .column("payload", DataTypes.VARIANT()) + .option("bucket", "-1") + .build(); + catalog.createTable(identifier, schema, false); + + FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); + BatchWriteBuilder writeBuilder = table.newBatchWriteBuilder(); + try (BatchTableWrite write = writeBuilder.newWrite(); + BatchTableCommit commit = writeBuilder.newCommit()) { + write.write( + GenericRow.of( + 1, + BinaryString.fromString("Alice"), + GenericVariant.fromJson("{\"age\":30,\"city\":\"Beijing\"}"))); + write.write( + GenericRow.of( + 2, + BinaryString.fromString("Bob"), + GenericVariant.fromJson("{\"age\":25,\"city\":\"Shanghai\"}"))); + write.write( + GenericRow.of( + 3, + BinaryString.fromString("Carol"), + 
GenericVariant.fromJson("[1,2,3]"))); + commit.commit(write.prepareCommit()); + } + + // Verify Java can read back what it wrote + FileStoreTable readTable = (FileStoreTable) catalog.getTable(identifier); + List splits = new ArrayList<>(readTable.newSnapshotReader().read().dataSplits()); + TableRead read = readTable.newRead(); + List res = + getResult(read, splits, row -> internalRowToString(row, readTable.rowType())); + assertThat(res).hasSize(3); + LOG.info("testJavaWriteVariantTable: wrote and read back {} VARIANT rows", res.size()); + + // Also write a shredded VARIANT table for Python to read (variant_shredded_test). + // The shredding schema shreds the 'age' (BIGINT) and 'city' (VARCHAR) sub-fields + // of the 'payload' column so Python can exercise the shredded-read path. + String shreddingJson = + "{\"type\":\"ROW\",\"fields\":[{\"name\":\"payload\",\"type\":" + + "{\"type\":\"ROW\",\"fields\":[" + + "{\"name\":\"age\",\"type\":\"BIGINT\"}," + + "{\"name\":\"city\",\"type\":\"VARCHAR\"}" + + "]}}]}"; + Identifier shreddedId = identifier("variant_shredded_test"); + catalog.dropTable(shreddedId, true); + Schema shreddedSchema = + Schema.newBuilder() + .column("id", DataTypes.INT()) + .column("name", DataTypes.STRING()) + .column("payload", DataTypes.VARIANT()) + .option("bucket", "-1") + .option("parquet.variant.shreddingSchema", shreddingJson) + .build(); + catalog.createTable(shreddedId, shreddedSchema, false); + + FileStoreTable shreddedTable = (FileStoreTable) catalog.getTable(shreddedId); + BatchWriteBuilder shreddedWriteBuilder = shreddedTable.newBatchWriteBuilder(); + try (BatchTableWrite shreddedWrite = shreddedWriteBuilder.newWrite(); + BatchTableCommit shreddedCommit = shreddedWriteBuilder.newCommit()) { + shreddedWrite.write( + GenericRow.of( + 1, + BinaryString.fromString("Alice"), + GenericVariant.fromJson("{\"age\":30,\"city\":\"Beijing\"}"))); + shreddedWrite.write( + GenericRow.of( + 2, + BinaryString.fromString("Bob"), + 
GenericVariant.fromJson("{\"age\":25,\"city\":\"Shanghai\"}"))); + shreddedWrite.write( + GenericRow.of( + 3, + BinaryString.fromString("Carol"), + GenericVariant.fromJson("[1,2,3]"))); + shreddedCommit.commit(shreddedWrite.prepareCommit()); + } + + FileStoreTable shreddedReadTable = (FileStoreTable) catalog.getTable(shreddedId); + List shreddedSplits = + new ArrayList<>(shreddedReadTable.newSnapshotReader().read().dataSplits()); + List shreddedRes = + getResult( + shreddedReadTable.newRead(), + shreddedSplits, + row -> internalRowToString(row, shreddedReadTable.rowType())); + assertThat(shreddedRes).hasSize(3); + LOG.info( + "testJavaWriteVariantTable: wrote shredded VARIANT table '{}' with {} rows", + shreddedId.getTableName(), + shreddedRes.size()); + } + + /** Java reads a VARIANT-column table written by Python (Python→Java E2E). */ + @Test + @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") + public void testJavaReadVariantTable() throws Exception { + Identifier identifier = identifier("py_variant_test"); + FileStoreTable table = (FileStoreTable) catalog.getTable(identifier); + List splits = new ArrayList<>(table.newSnapshotReader().read().dataSplits()); + TableRead read = table.newRead(); + List res = + getResult(read, splits, row -> internalRowToString(row, table.rowType())); + assertThat(res).hasSize(4); + + // Verify the VARIANT column is present in the schema + assertThat(table.rowType().getFieldNames()).contains("payload"); + assertThat(table.rowType().getTypeAt(table.rowType().getFieldIndex("payload"))) + .isEqualTo(DataTypes.VARIANT()); + + // Verify each row's VARIANT payload can be decoded by Java + List splits2 = new ArrayList<>(table.newSnapshotReader().read().dataSplits()); + try (org.apache.paimon.reader.RecordReader reader = + read.createReader(splits2)) { + reader.forEachRemaining( + row -> { + int id = row.getInt(0); + if (id == 4) { + // null payload + assertThat(row.isNullAt(2)).isTrue(); + } else { + 
assertThat(row.isNullAt(2)).isFalse(); + org.apache.paimon.data.variant.Variant v = row.getVariant(2); + assertThat(v).isNotNull(); + } + }); + } + LOG.info( + "testJavaReadVariantTable: Java read {} VARIANT rows written by Python", + res.size()); + + // Also read the shredded VARIANT table written by Python (py_variant_shredded_test). + // Python writes with variant.shreddingSchema to produce shredded Parquet; Java must + // reassemble the shredded sub-columns back into VARIANT values. + Identifier shreddedId = identifier("py_variant_shredded_test"); + FileStoreTable shreddedTable = (FileStoreTable) catalog.getTable(shreddedId); + List shreddedSplits = + new ArrayList<>(shreddedTable.newSnapshotReader().read().dataSplits()); + TableRead shreddedRead = shreddedTable.newRead(); + List shreddedRes = + getResult( + shreddedRead, + shreddedSplits, + row -> internalRowToString(row, shreddedTable.rowType())); + assertThat(shreddedRes).hasSize(3); + + // Schema check: the logical type must still be VARIANT + assertThat(shreddedTable.rowType().getFieldNames()).contains("payload"); + assertThat( + shreddedTable + .rowType() + .getTypeAt(shreddedTable.rowType().getFieldIndex("payload"))) + .isEqualTo(DataTypes.VARIANT()); + + // Verify each row's VARIANT can be decoded + List shreddedSplits2 = + new ArrayList<>(shreddedTable.newSnapshotReader().read().dataSplits()); + try (org.apache.paimon.reader.RecordReader reader = + shreddedRead.createReader(shreddedSplits2)) { + reader.forEachRemaining( + row -> { + assertThat(row.isNullAt(2)).isFalse(); + org.apache.paimon.data.variant.Variant v = row.getVariant(2); + assertThat(v).isNotNull(); + }); + } + LOG.info( + "testJavaReadVariantTable: Java read {} shredded VARIANT rows written by Python", + shreddedRes.size()); + } + /** Step 1: Write 5 base files for compact conflict test. 
*/ @Test @EnabledIfSystemProperty(named = "run.e2e.tests", matches = "true") diff --git a/paimon-python/dev/run_mixed_tests.sh b/paimon-python/dev/run_mixed_tests.sh index 077b5af27664..2067d1813eab 100755 --- a/paimon-python/dev/run_mixed_tests.sh +++ b/paimon-python/dev/run_mixed_tests.sh @@ -339,6 +339,55 @@ run_blob_alter_compact_test() { fi } +# Function to run VARIANT test (Java write, Python read) +run_java_variant_write_py_read_test() { + echo -e "${YELLOW}=== Running VARIANT Test (Java Write, Python Read) ===${NC}" + + cd "$PROJECT_ROOT" + + echo "Running Maven test for JavaPyE2ETest.testJavaWriteVariantTable..." + if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testJavaWriteVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then + echo -e "${GREEN}✓ Java VARIANT write test completed successfully${NC}" + else + echo -e "${RED}✗ Java VARIANT write test failed${NC}" + return 1 + fi + cd "$PAIMON_PYTHON_DIR" + echo "Running Python test for JavaPyReadWriteTest.test_py_read_variant_table..." + if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_read_variant_table -v; then + echo -e "${GREEN}✓ Python VARIANT read test completed successfully${NC}" + return 0 + else + echo -e "${RED}✗ Python VARIANT read test failed${NC}" + return 1 + fi +} + +# Function to run VARIANT test (Python write, Java read) +run_py_variant_write_java_read_test() { + echo -e "${YELLOW}=== Running VARIANT Test (Python Write, Java Read) ===${NC}" + + cd "$PAIMON_PYTHON_DIR" + echo "Running Python test for JavaPyReadWriteTest.test_py_write_variant_table..." + if ! python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_py_write_variant_table -v; then + echo -e "${RED}✗ Python VARIANT write test failed${NC}" + return 1 + fi + echo -e "${GREEN}✓ Python VARIANT write test completed successfully${NC}" + + echo "" + + cd "$PROJECT_ROOT" + echo "Running Maven test for JavaPyE2ETest.testJavaReadVariantTable..." 
+ if mvn test -Dtest=org.apache.paimon.JavaPyE2ETest#testJavaReadVariantTable -pl paimon-core -q -Drun.e2e.tests=true; then + echo -e "${GREEN}✓ Java VARIANT read test completed successfully${NC}" + return 0 + else + echo -e "${RED}✗ Java VARIANT read test failed${NC}" + return 1 + fi +} + # Main execution main() { local java_write_result=0 @@ -352,6 +401,8 @@ main() { local lumina_vector_result=0 local compact_conflict_result=0 local blob_alter_compact_result=0 + local java_variant_write_py_read_result=0 + local py_variant_write_java_read_result=0 # Detect Python version PYTHON_VERSION=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "unknown") @@ -448,6 +499,20 @@ main() { echo "" + # Run VARIANT type test (Java write, Python read) + if ! run_java_variant_write_py_read_test; then + java_variant_write_py_read_result=1 + fi + + echo "" + + # Run VARIANT Python-write Java-read test + if ! run_py_variant_write_java_read_test; then + py_variant_write_java_read_result=1 + fi + + echo "" + echo -e "${YELLOW}=== Test Results Summary ===${NC}" if [[ $java_write_result -eq 0 ]]; then @@ -516,12 +581,24 @@ main() { echo -e "${RED}✗ Blob Alter+Compact Test (Java Write+Alter+Compact, Python Read): FAILED${NC}" fi + if [[ $java_variant_write_py_read_result -eq 0 ]]; then + echo -e "${GREEN}✓ VARIANT Type Test (Java Write, Python Read): PASSED${NC}" + else + echo -e "${RED}✗ VARIANT Type Test (Java Write, Python Read): FAILED${NC}" + fi + + if [[ $py_variant_write_java_read_result -eq 0 ]]; then + echo -e "${GREEN}✓ VARIANT Type Test (Python Write, Java Read): PASSED${NC}" + else + echo -e "${RED}✗ VARIANT Type Test (Python Write, Java Read): FAILED${NC}" + fi + echo "" # Clean up warehouse directory after all tests cleanup_warehouse - if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && 
$compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 && $blob_alter_compact_result -eq 0 ]]; then + if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 && $lumina_vector_result -eq 0 && $compact_conflict_result -eq 0 && $blob_alter_compact_result -eq 0 && $java_variant_write_py_read_result -eq 0 && $py_variant_write_java_read_result -eq 0 ]]; then echo -e "${GREEN}🎉 All tests passed! Java-Python interoperability verified.${NC}" return 0 else diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py index 8b84e6d38d73..fe3ba668abc0 100644 --- a/paimon-python/pypaimon/common/options/core_options.py +++ b/paimon-python/pypaimon/common/options/core_options.py @@ -405,6 +405,21 @@ class CoreOptions: ) ) + VARIANT_SHREDDING_SCHEMA: ConfigOption[str] = ( + ConfigOptions.key("variant.shreddingSchema") + .string_type() + .no_default_value() + .with_description( + "JSON-encoded ROW type specifying which VARIANT sub-fields to shred when " + "writing Parquet (static shredding mode). The top-level fields map VARIANT " + "column names to their sub-field schemas. " + "Alias: 'parquet.variant.shreddingSchema'. 
" + "Example: '{\"type\":\"ROW\",\"fields\":[{\"id\":0,\"name\":\"payload\"," + "\"type\":{\"type\":\"ROW\",\"fields\":[{\"id\":0,\"name\":\"age\"," + "\"type\":\"BIGINT\"}]}}]}'" + ) + ) + PARTITION_DEFAULT_NAME: ConfigOption[str] = ( ConfigOptions.key("partition.default-name") .string_type() @@ -480,6 +495,13 @@ def metadata_stats_enabled(self, default=None): def blob_as_descriptor(self, default=None): return self.options.get(CoreOptions.BLOB_AS_DESCRIPTOR, default) + def variant_shredding_schema(self) -> Optional[str]: + val = self.options.get(CoreOptions.VARIANT_SHREDDING_SCHEMA) + if val is None: + # Support alias used by Java: parquet.variant.shreddingSchema + val = self.options.data.get("parquet.variant.shreddingSchema") + return val + def blob_descriptor_fields(self, default=None): value = self.options.get(CoreOptions.BLOB_DESCRIPTOR_FIELD, default) if value is None: diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py new file mode 100644 index 000000000000..08ba51def448 --- /dev/null +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -0,0 +1,872 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +################################################################################ + +"""Python implementation of Paimon GenericVariant — a thin storage container. + +Paimon's VARIANT type is stored as two byte arrays: + value – encodes the structure and leaf values (Parquet Variant binary spec) + metadata – key dictionary for object field names + +pypaimon exposes VARIANT columns as Arrow struct. This class is a convenience wrapper that lets +Python code construct (for writing) or inspect (for debugging) those bytes. + +Variant semantics such as path extraction, type casting, and JSONPath queries +are the responsibility of the compute engine or application layer, not of +pypaimon (the storage layer). + +Primary entry points: + GenericVariant.from_json(json_str) – build from a JSON string (for writing) + GenericVariant.from_python(obj) – build from a Python object (for writing) + GenericVariant(value, metadata) – wrap raw bytes read from a VARIANT column + GenericVariant.from_arrow_struct(d) – wrap a row dict from a VARIANT Arrow column + GenericVariant.to_arrow_array(variants)– convert a list to a PyArrow StructArray + +Inspection helpers (for debugging/testing): + v.to_json() – decode back to a JSON string + v.to_python() – decode to native Python objects + v.value() – raw value bytes + v.metadata() – raw metadata bytes +""" + +import base64 +import datetime +import decimal as _decimal +import enum +import json as _json +import struct + +# --------------------------------------------------------------------------- +# Constants (matching GenericVariantUtil.java) +# --------------------------------------------------------------------------- + +_PRIMITIVE = 0 +_SHORT_STR = 1 +_OBJECT = 2 +_ARRAY = 3 + +_NULL = 0 +_TRUE = 1 +_FALSE = 2 +_INT1 = 3 +_INT2 = 4 +_INT4 = 5 +_INT8 = 6 +_DOUBLE = 7 +_DECIMAL4 = 8 +_DECIMAL8 = 9 +_DECIMAL16 = 10 +_DATE = 11 +_TIMESTAMP = 
12 +_TIMESTAMP_NTZ = 13 +_FLOAT = 14 +_BINARY = 15 +_LONG_STR = 16 +_UUID = 20 + +_VERSION = 1 +_VERSION_MASK = 0x0F +_SIZE_LIMIT = 128 * 1024 * 1024 +_MAX_SHORT_STR_SIZE = 0x3F # 63 +_U8_MAX = 255 +_U16_MAX = 65535 +_U24_MAX = 16777215 +_U32_SIZE = 4 +_MAX_DECIMAL4_PRECISION = 9 +_MAX_DECIMAL8_PRECISION = 18 +_MAX_DECIMAL16_PRECISION = 38 + +# Epoch for date/timestamp conversions (used by to_json / to_python) +_EPOCH_DATE = datetime.date(1970, 1, 1) +_EPOCH_DT_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) +_EPOCH_DT_NTZ = datetime.datetime(1970, 1, 1) + + +class _Type(enum.Enum): + """Internal high-level variant value types (many-to-one from wire types).""" + OBJECT = 'OBJECT' + ARRAY = 'ARRAY' + NULL = 'NULL' + BOOLEAN = 'BOOLEAN' + LONG = 'LONG' + STRING = 'STRING' + DOUBLE = 'DOUBLE' + DECIMAL = 'DECIMAL' + DATE = 'DATE' + TIMESTAMP = 'TIMESTAMP' + TIMESTAMP_NTZ = 'TIMESTAMP_NTZ' + FLOAT = 'FLOAT' + BINARY = 'BINARY' + UUID = 'UUID' + + +_PRIMITIVE_TYPE_MAP = { + _NULL: _Type.NULL, + _TRUE: _Type.BOOLEAN, _FALSE: _Type.BOOLEAN, + _INT1: _Type.LONG, _INT2: _Type.LONG, _INT4: _Type.LONG, _INT8: _Type.LONG, + _DOUBLE: _Type.DOUBLE, + _DECIMAL4: _Type.DECIMAL, _DECIMAL8: _Type.DECIMAL, _DECIMAL16: _Type.DECIMAL, + _DATE: _Type.DATE, + _TIMESTAMP: _Type.TIMESTAMP, + _TIMESTAMP_NTZ: _Type.TIMESTAMP_NTZ, + _FLOAT: _Type.FLOAT, + _BINARY: _Type.BINARY, + _LONG_STR: _Type.STRING, + _UUID: _Type.UUID, +} +_PRIMITIVE_FIXED_SIZES = { + _NULL: 1, _TRUE: 1, _FALSE: 1, + _INT1: 2, _INT2: 3, _INT4: 5, _INT8: 9, + _DOUBLE: 9, _FLOAT: 5, _DATE: 5, + _TIMESTAMP: 9, _TIMESTAMP_NTZ: 9, + _DECIMAL4: 6, _DECIMAL8: 10, _DECIMAL16: 18, + _UUID: 17, +} +_LONG_FAMILY_SIZES = { + _INT1: 1, _INT2: 2, _INT4: 4, _INT8: 8, + _DATE: 4, _TIMESTAMP: 8, _TIMESTAMP_NTZ: 8, +} + + +# --------------------------------------------------------------------------- +# Low-level binary utilities +# --------------------------------------------------------------------------- + +def 
def _read_unsigned(data, pos, n):
    """Read an n-byte little-endian unsigned integer at data[pos]."""
    return int.from_bytes(data[pos:pos + n], 'little', signed=False)


def _read_signed(data, pos, n):
    """Read an n-byte little-endian two's-complement signed integer at data[pos]."""
    return int.from_bytes(data[pos:pos + n], 'little', signed=True)


def _write_le(buf, pos, value, n):
    """Write ``value`` as n little-endian bytes into ``buf`` at ``pos`` (in place)."""
    buf[pos:pos + n] = value.to_bytes(n, 'little')


def _get_int_size(value):
    """Smallest byte width (1..4) able to hold the unsigned integer ``value``."""
    if value <= _U8_MAX:
        return 1
    if value <= _U16_MAX:
        return 2
    if value <= _U24_MAX:
        return 3
    return 4


def _primitive_header(type_id):
    """Header byte for a primitive value: type id in bits 2-7, basic type in bits 0-1."""
    return (type_id << 2) | _PRIMITIVE


def _short_str_header(size):
    """Header byte for a short string (size <= 63, stored inline in the header)."""
    return (size << 2) | _SHORT_STR


def _object_header(large_size, id_size, offset_size):
    """Header byte for an object: large-size flag, id width and offset width."""
    return (
        ((1 if large_size else 0) << 6)
        | ((id_size - 1) << 4)
        | ((offset_size - 1) << 2)
        | _OBJECT
    )


def _array_header(large_size, offset_size):
    """Header byte for an array: large-size flag and offset width."""
    return (
        ((1 if large_size else 0) << 4)
        | ((offset_size - 1) << 2)
        | _ARRAY
    )


def _variant_get_type(value, pos):
    """Classify the variant value at ``pos`` into a logical ``_Type``.

    Raises:
        ValueError: for a primitive type id outside ``_PRIMITIVE_TYPE_MAP``.
    """
    b = value[pos]
    basic_type = b & 0x3
    type_info = (b >> 2) & 0x3F
    if basic_type == _SHORT_STR:
        return _Type.STRING
    if basic_type == _OBJECT:
        return _Type.OBJECT
    if basic_type == _ARRAY:
        return _Type.ARRAY
    t = _PRIMITIVE_TYPE_MAP.get(type_info)
    if t is None:
        raise ValueError(f'Unknown primitive variant type id: {type_info}')
    return t


def _value_size(value, pos):
    """Total encoded byte size (header included) of the value at ``pos``."""
    b = value[pos]
    basic_type = b & 0x3
    type_info = (b >> 2) & 0x3F
    if basic_type == _SHORT_STR:
        # one header byte + inline string bytes
        return 1 + type_info
    if basic_type == _OBJECT:
        # header..data_start plus the data length recorded in the offset sentinel
        return _handle_object(
            value, pos,
            lambda size, id_size, offset_size, id_start, offset_start, data_start: (
                data_start - pos + _read_unsigned(
                    value, offset_start + size * offset_size, offset_size)
            )
        )
    if basic_type == _ARRAY:
        return _handle_array(
            value, pos,
            lambda size, offset_size, offset_start, data_start: (
                data_start - pos + _read_unsigned(
                    value, offset_start + size * offset_size, offset_size)
            )
        )
    size = _PRIMITIVE_FIXED_SIZES.get(type_info)
    if size is not None:
        return size
    if type_info in (_BINARY, _LONG_STR):
        # one header byte + 4-byte length prefix + payload
        return 1 + _U32_SIZE + _read_unsigned(value, pos + 1, _U32_SIZE)
    raise ValueError(f'Unknown primitive type id: {type_info}')


def _handle_object(value, pos, handler):
    """Decode the layout of an object value and delegate to ``handler``.

    ``handler`` receives ``(size, id_size, offset_size, id_start,
    offset_start, data_start)``.
    """
    b = value[pos]
    type_info = (b >> 2) & 0x3F
    large_size = bool((type_info >> 4) & 0x1)
    size_bytes = _U32_SIZE if large_size else 1
    size = _read_unsigned(value, pos + 1, size_bytes)
    id_size = ((type_info >> 2) & 0x3) + 1
    offset_size = (type_info & 0x3) + 1
    id_start = pos + 1 + size_bytes
    offset_start = id_start + size * id_size
    data_start = offset_start + (size + 1) * offset_size
    return handler(size, id_size, offset_size, id_start, offset_start, data_start)


def _handle_array(value, pos, handler):
    """Decode the layout of an array value and delegate to ``handler``.

    ``handler`` receives ``(size, offset_size, offset_start, data_start)``.
    """
    b = value[pos]
    type_info = (b >> 2) & 0x3F
    large_size = bool((type_info >> 2) & 0x1)
    size_bytes = _U32_SIZE if large_size else 1
    size = _read_unsigned(value, pos + 1, size_bytes)
    offset_size = (type_info & 0x3) + 1
    offset_start = pos + 1 + size_bytes
    data_start = offset_start + (size + 1) * offset_size
    return handler(size, offset_size, offset_start, data_start)


def _get_metadata_key(metadata, key_id):
    """Look up the UTF-8 key name for ``key_id`` in the metadata dictionary.

    Raises:
        ValueError: if ``key_id`` is outside the dictionary.
    """
    offset_size = ((metadata[0] >> 6) & 0x3) + 1
    dict_size = _read_unsigned(metadata, 1, offset_size)
    if key_id >= dict_size:
        raise ValueError('MALFORMED_VARIANT: key id out of range')
    string_start = 1 + (dict_size + 2) * offset_size
    offset = _read_unsigned(metadata, 1 + (key_id + 1) * offset_size, offset_size)
    next_offset = _read_unsigned(metadata, 1 + (key_id + 2) * offset_size, offset_size)
    return metadata[string_start + offset:string_start + next_offset].decode('utf-8')


# ---------------------------------------------------------------------------
# _GenericVariantBuilder (for from_json / from_python)
# ---------------------------------------------------------------------------

class _GenericVariantBuilder:
    """Builds a GenericVariant from Python values or JSON strings."""

    def __init__(self):
        self._buf = bytearray(128)   # value buffer, grown on demand
        self._pos = 0                # write cursor into _buf
        self._dict = {}              # key name -> key id
        self._keys = []              # key id -> UTF-8 encoded key name

    def _get_or_add_key(self, key):
        """Return the dictionary id for ``key``, registering it on first use."""
        if key not in self._dict:
            kid = len(self._keys)
            self._dict[key] = kid
            self._keys.append(key.encode('utf-8'))
        return self._dict[key]

    def _ensure(self, n):
        """Grow the buffer (at least doubling) so n more bytes fit at _pos."""
        needed = self._pos + n
        if needed > len(self._buf):
            new_cap = max(needed, len(self._buf) * 2)
            new_buf = bytearray(new_cap)
            new_buf[:self._pos] = self._buf[:self._pos]
            self._buf = new_buf

    def _write_byte(self, b):
        self._ensure(1)
        self._buf[self._pos] = b & 0xFF
        self._pos += 1

    def _write_le(self, value, n):
        self._ensure(n)
        _write_le(self._buf, self._pos, value, n)
        self._pos += n

    def append_null(self):
        self._write_byte(_primitive_header(_NULL))

    def append_boolean(self, b):
        self._write_byte(_primitive_header(_TRUE if b else _FALSE))

    def append_long(self, n):
        # Pick the narrowest signed width; masking yields the two's-complement
        # little-endian representation for negative values.
        if -(1 << 7) <= n < (1 << 7):
            self._write_byte(_primitive_header(_INT1))
            self._write_le(n & 0xFF, 1)
        elif -(1 << 15) <= n < (1 << 15):
            self._write_byte(_primitive_header(_INT2))
            self._write_le(n & 0xFFFF, 2)
        elif -(1 << 31) <= n < (1 << 31):
            self._write_byte(_primitive_header(_INT4))
            self._write_le(n & 0xFFFFFFFF, 4)
        else:
            self._write_byte(_primitive_header(_INT8))
            self._write_le(n & 0xFFFFFFFFFFFFFFFF, 8)

    def append_double(self, d):
        # NOTE(review): the pack format string was eaten by markup-stripping
        # in the original patch; '<d' (little-endian IEEE-754 double) is the
        # encoding required by the Parquet Variant spec.
        self._write_byte(_primitive_header(_DOUBLE))
        self._ensure(8)
        struct.pack_into('<d', self._buf, self._pos, d)
        self._pos += 8

    def append_float(self, f):
        # The shredding module calls append_float for float32 typed values,
        # but no 4-byte FLOAT encoding survives in this patch. Widening to
        # DOUBLE is exact for every float32 and keeps those callers working.
        self.append_double(float(f))

    def append_decimal(self, d):
        """Encode a ``decimal.Decimal`` as DECIMAL4 / DECIMAL8 / DECIMAL16.

        Raises:
            ValueError: for Decimals with a positive exponent (use
                append_double for those).
        """
        sign, digits, exponent = d.as_tuple()
        if exponent > 0:
            raise ValueError(
                f'append_decimal requires a non-positive exponent (got {d!r}); '
                'use append_double() for Decimal values with positive exponents'
            )
        unscaled = int(''.join(str(x) for x in digits))
        if sign:
            unscaled = -unscaled
        scale = -exponent if exponent < 0 else 0
        precision = len(digits)

        if scale <= _MAX_DECIMAL4_PRECISION and precision <= _MAX_DECIMAL4_PRECISION:
            self._write_byte(_primitive_header(_DECIMAL4))
            self._write_byte(scale)
            self._write_le(unscaled & 0xFFFFFFFF, 4)
        elif scale <= _MAX_DECIMAL8_PRECISION and precision <= _MAX_DECIMAL8_PRECISION:
            self._write_byte(_primitive_header(_DECIMAL8))
            self._write_byte(scale)
            self._write_le(unscaled & 0xFFFFFFFFFFFFFFFF, 8)
        else:
            self._write_byte(_primitive_header(_DECIMAL16))
            self._write_byte(scale)
            self._ensure(16)
            raw = unscaled.to_bytes(16, 'little', signed=True)
            self._buf[self._pos:self._pos + 16] = raw
            self._pos += 16

    def append_string(self, s):
        text = s.encode('utf-8')
        if len(text) <= _MAX_SHORT_STR_SIZE:
            self._write_byte(_short_str_header(len(text)))
        else:
            self._write_byte(_primitive_header(_LONG_STR))
            self._write_le(len(text), _U32_SIZE)
        self._ensure(len(text))
        self._buf[self._pos:self._pos + len(text)] = text
        self._pos += len(text)

    def append_binary(self, b):
        self._write_byte(_primitive_header(_BINARY))
        self._write_le(len(b), _U32_SIZE)
        self._ensure(len(b))
        self._buf[self._pos:self._pos + len(b)] = b
        self._pos += len(b)

    def append_date(self, days_since_epoch):
        self._write_byte(_primitive_header(_DATE))
        self._write_le(days_since_epoch & 0xFFFFFFFF, 4)

    def append_timestamp(self, micros_since_epoch):
        self._write_byte(_primitive_header(_TIMESTAMP))
        self._write_le(micros_since_epoch & 0xFFFFFFFFFFFFFFFF, 8)

    def append_timestamp_ntz(self, micros_since_epoch):
        self._write_byte(_primitive_header(_TIMESTAMP_NTZ))
        self._write_le(micros_since_epoch & 0xFFFFFFFFFFFFFFFF, 8)

    def _finish_writing_object(self, start, fields):
        """Insert the object header before already-written field data.

        ``fields`` is a list of ``(key, key_id, data_offset)``; the data for
        each field was appended to the buffer starting at ``start``. The
        header (size, id table, offset table) is spliced in front of it.

        Raises:
            ValueError: when the same key appears twice.
        """
        fields.sort(key=lambda f: f[0])
        for i in range(1, len(fields)):
            if fields[i][0] == fields[i - 1][0]:
                raise ValueError('Duplicate key in variant object')

        size = len(fields)
        data_size = self._pos - start
        large_size = size > _U8_MAX
        size_bytes = _U32_SIZE if large_size else 1
        max_id = max((f[1] for f in fields), default=0)
        id_size = _get_int_size(max_id)
        offset_size = _get_int_size(data_size)
        header_size = 1 + size_bytes + size * id_size + (size + 1) * offset_size

        # Shift the field data right to make room for the header. The RHS
        # slice copies first, so the overlapping move is safe.
        self._ensure(header_size)
        dst = start + header_size
        src = start
        self._buf[dst:dst + data_size] = self._buf[src:src + data_size]
        self._pos += header_size

        self._buf[start] = _object_header(large_size, id_size, offset_size)
        _write_le(self._buf, start + 1, size, size_bytes)
        id_start = start + 1 + size_bytes
        offset_start = id_start + size * id_size
        for i, (_, fid, offset) in enumerate(fields):
            _write_le(self._buf, id_start + i * id_size, fid, id_size)
            _write_le(self._buf, offset_start + i * offset_size, offset, offset_size)
        # Sentinel offset records the total data size.
        _write_le(self._buf, offset_start + size * offset_size, data_size, offset_size)

    def _finish_writing_array(self, start, offsets):
        """Insert the array header before already-written element data."""
        size = len(offsets)
        data_size = self._pos - start
        large_size = size > _U8_MAX
        size_bytes = _U32_SIZE if large_size else 1
        offset_size = _get_int_size(data_size)
        header_size = 1 + size_bytes + (size + 1) * offset_size

        self._ensure(header_size)
        dst = start + header_size
        self._buf[dst:dst + data_size] = self._buf[start:start + data_size]
        self._pos += header_size

        self._buf[start] = _array_header(large_size, offset_size)
        _write_le(self._buf, start + 1, size, size_bytes)
        offset_start = start + 1 + size_bytes
        for i, off in enumerate(offsets):
            _write_le(self._buf, offset_start + i * offset_size, off, offset_size)
        _write_le(self._buf, offset_start + size * offset_size, data_size, offset_size)

    def build_python(self, obj):
        """Recursively encode a Python value into the variant binary buffer.

        Raises:
            TypeError: for unsupported Python types.
        """
        if obj is None:
            self.append_null()
        elif isinstance(obj, bool):
            # bool is a subclass of int: must be tested first
            self.append_boolean(obj)
        elif isinstance(obj, int):
            self.append_long(obj)
        elif isinstance(obj, float):
            self.append_double(obj)
        elif isinstance(obj, _decimal.Decimal):
            self.append_decimal_or_double(obj) if False else self._try_decimal_or_double(obj)
        elif isinstance(obj, str):
            self.append_string(obj)
        elif isinstance(obj, dict):
            fields = []
            start = self._pos
            for key, val in obj.items():
                fid = self._get_or_add_key(key)
                offset = self._pos - start
                fields.append((key, fid, offset))
                self.build_python(val)
            self._finish_writing_object(start, fields)
        elif isinstance(obj, (list, tuple)):
            elem_offsets = []
            start = self._pos
            for val in obj:
                elem_offsets.append(self._pos - start)
                self.build_python(val)
            self._finish_writing_array(start, elem_offsets)
        elif isinstance(obj, (bytes, bytearray, memoryview)):
            # generalized: accept any bytes-like object
            self.append_binary(bytes(obj))
        else:
            raise TypeError(f'Unsupported Python type for variant encoding: {type(obj).__name__}')

    def _try_decimal_or_double(self, d):
        """Encode ``d`` as a decimal when it fits DECIMAL16, else as a double.

        Fix: non-finite Decimals (NaN, +/-Infinity) report a *string* exponent
        from as_tuple(), so ``exponent > 0`` raised an uncaught TypeError;
        route them straight to the double encoding.
        """
        if not d.is_finite():
            self.append_double(float(d))
            return
        try:
            sign, digits, exponent = d.as_tuple()
            if exponent > 0:
                self.append_double(float(d))
                return
            scale = -exponent if exponent < 0 else 0
            precision = len(digits)
            if scale <= _MAX_DECIMAL16_PRECISION and precision <= _MAX_DECIMAL16_PRECISION:
                self.append_decimal(d)
                return
        except (ArithmeticError, ValueError):
            pass
        self.append_double(float(d))

    def result(self):
        """Build metadata and return the completed GenericVariant."""
        n_keys = len(self._keys)
        total_str_size = sum(len(k) for k in self._keys)
        # One offset width serves both the dictionary size and every offset.
        max_size = max(total_str_size, n_keys, 0)
        offset_size = _get_int_size(max_size) if max_size > 0 else 1

        offset_start = 1 + offset_size
        string_start = offset_start + (n_keys + 1) * offset_size
        metadata_size = string_start + total_str_size

        metadata = bytearray(metadata_size)
        metadata[0] = _VERSION | ((offset_size - 1) << 6)
        _write_le(metadata, 1, n_keys, offset_size)

        current_offset = 0
        for i, key_bytes in enumerate(self._keys):
            _write_le(metadata, offset_start + i * offset_size, current_offset, offset_size)
            klen = len(key_bytes)
            metadata[string_start + current_offset:string_start + current_offset + klen] = key_bytes
            current_offset += klen
        _write_le(metadata, offset_start + n_keys * offset_size, current_offset, offset_size)

        return GenericVariant(bytes(self._buf[:self._pos]), bytes(metadata))


# ---------------------------------------------------------------------------
# 
GenericVariant +# --------------------------------------------------------------------------- + +class GenericVariant: + """Storage container for a Paimon/Parquet VARIANT value. + + A VARIANT value is stored as two byte arrays: + value – encoded payload (Parquet Variant binary spec) + metadata – key dictionary for object field names + + pypaimon exposes VARIANT columns as Arrow struct arrays with exactly these + two fields. This class helps Python code build or inspect those bytes. + + **Writing example**:: + + import pyarrow as pa + from pypaimon.data.generic_variant import GenericVariant + + gv1 = GenericVariant.from_json('{"age": 30, "city": "Beijing"}') + gv2 = GenericVariant.from_json('[1, 2, 3]') + # Create an Arrow StructArray ready for writing + col = GenericVariant.to_arrow_array([gv1, gv2, None]) + table = pa.table({'id': [1, 2, 3], 'payload': col}) + + **Reading example**:: + + result = table_read.to_arrow(splits) + # 'payload' column is struct + for row in result.column('payload').to_pylist(): + if row is not None: + gv = GenericVariant.from_arrow_struct(row) + print(gv.to_json()) # e.g. '{"age":30,"city":"Beijing"}' + print(gv.to_python()) # e.g. {'age': 30, 'city': 'Beijing'} + """ + + __slots__ = ('_value', '_metadata', '_pos') + + def __init__(self, value: bytes, metadata: bytes, _pos: int = 0): + self._value = bytes(value) + self._metadata = bytes(metadata) + self._pos = _pos + if len(metadata) < 1 or (metadata[0] & _VERSION_MASK) != _VERSION: + raise ValueError('MALFORMED_VARIANT: invalid metadata version') + + # -- constructors -- + + @classmethod + def from_json(cls, json_str: str) -> 'GenericVariant': + """Parse a JSON string and encode it as VARIANT binary bytes. 
+ + Use this when writing VARIANT data from Python:: + + gv = GenericVariant.from_json('{"name": "Alice", "age": 30}') + col = GenericVariant.to_arrow_array([gv]) + """ + obj = _json.loads(json_str, parse_float=_decimal.Decimal) + builder = _GenericVariantBuilder() + builder.build_python(obj) + return builder.result() + + @classmethod + def from_python(cls, obj) -> 'GenericVariant': + """Encode a Python object (dict / list / str / int / float / bool / None) as VARIANT. + + Use this when writing VARIANT data from Python:: + + gv = GenericVariant.from_python({'name': 'Alice', 'age': 30}) + col = GenericVariant.to_arrow_array([gv]) + """ + builder = _GenericVariantBuilder() + builder.build_python(obj) + return builder.result() + + @classmethod + def from_arrow_struct(cls, d: dict) -> 'GenericVariant': + """Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}. + + Use this on the read path after reading a VARIANT column:: + + for row in result.column("payload").to_pylist(): + if row is not None: + gv = GenericVariant.from_arrow_struct(row) + print(gv.to_json()) + """ + return cls(bytes(d['value']), bytes(d['metadata'])) + + @classmethod + def to_arrow_array(cls, variants): + """Convert a list of GenericVariant (or None) to a PyArrow StructArray. + + The returned array has the canonical VARIANT layout:: + + struct + + Pass None in the list to represent a SQL NULL (absent VARIANT value). 
+ + Example:: + + gv1 = GenericVariant.from_json('{"age":30}') + gv2 = GenericVariant.from_json('[1,2,3]') + col = GenericVariant.to_arrow_array([gv1, gv2, None]) + table = pa.table({'id': [1, 2, 3], 'payload': col}) + """ + import pyarrow as _pa + + values = [] + metadatas = [] + mask = [] + for v in variants: + if v is None: + values.append(b'') + metadatas.append(b'') + mask.append(True) + else: + values.append(v.value()) + metadatas.append(v.metadata()) + mask.append(False) + + variant_type = _pa.struct([ + _pa.field('value', _pa.binary(), nullable=False), + _pa.field('metadata', _pa.binary(), nullable=False), + ]) + return _pa.StructArray.from_arrays( + [_pa.array(values, type=_pa.binary()), + _pa.array(metadatas, type=_pa.binary())], + fields=[variant_type[0], variant_type[1]], + mask=_pa.array(mask, type=_pa.bool_()), + ) + + # -- raw bytes -- + + def value(self) -> bytes: + """Return the value payload bytes.""" + if self._pos == 0: + return self._value + size = _value_size(self._value, self._pos) + return self._value[self._pos:self._pos + size] + + def metadata(self) -> bytes: + """Return the metadata (key-dictionary) bytes.""" + return self._metadata + + # -- inspection helpers (for debugging / testing) -- + + def to_json(self) -> str: + """Decode the variant to a JSON string. + + Useful for debugging and testing. Variant semantics and path-based + queries are the responsibility of the application layer. 
+ """ + parts = [] + self._to_json_impl(self._value, self._metadata, self._pos, parts) + return ''.join(parts) + + def _to_json_impl(self, value, metadata, pos, parts): + vtype = _variant_get_type(value, pos) + if vtype == _Type.OBJECT: + def _render(size, id_size, offset_size, id_start, offset_start, data_start): + parts.append('{') + for i in range(size): + fid = _read_unsigned(value, id_start + id_size * i, id_size) + key = _get_metadata_key(metadata, fid) + offset = _read_unsigned( + value, offset_start + offset_size * i, offset_size) + if i != 0: + parts.append(',') + parts.append(_json.dumps(key)) + parts.append(':') + self._to_json_impl(value, metadata, data_start + offset, parts) + parts.append('}') + _handle_object(value, pos, _render) + elif vtype == _Type.ARRAY: + def _render_arr(size, offset_size, offset_start, data_start): + parts.append('[') + for i in range(size): + offset = _read_unsigned( + value, offset_start + offset_size * i, offset_size) + if i != 0: + parts.append(',') + self._to_json_impl(value, metadata, data_start + offset, parts) + parts.append(']') + _handle_array(value, pos, _render_arr) + else: + b = value[pos] + basic_type = b & 0x3 + type_info = (b >> 2) & 0x3F + if vtype == _Type.NULL: + parts.append('null') + elif vtype == _Type.BOOLEAN: + parts.append('true' if type_info == _TRUE else 'false') + elif vtype == _Type.LONG: + n = _LONG_FAMILY_SIZES.get(type_info) + parts.append(str(_read_signed(value, pos + 1, n))) + elif vtype == _Type.STRING: + if basic_type == _SHORT_STR: + s = value[pos + 1:pos + 1 + type_info].decode('utf-8') + else: + length = _read_unsigned(value, pos + 1, _U32_SIZE) + s = value[pos + 1 + _U32_SIZE:pos + 1 + _U32_SIZE + length].decode('utf-8') + parts.append(_json.dumps(s)) + elif vtype == _Type.DOUBLE: + d = struct.unpack_from('> 2) & 0x3F + + if vtype == _Type.NULL: + return None + if vtype == _Type.BOOLEAN: + return type_info == _TRUE + if vtype == _Type.LONG: + n = _LONG_FAMILY_SIZES.get(type_info) + return 
_read_signed(value, pos + 1, n) + if vtype == _Type.DOUBLE: + return struct.unpack_from(' str: + return f'GenericVariant({self.to_json()!r})' + + def __str__(self) -> str: + return self.to_json() + + def __eq__(self, other) -> bool: + if not isinstance(other, GenericVariant): + return NotImplemented + return self.value() == other.value() and self._metadata == other._metadata + + def __hash__(self): + return hash((self.value(), self._metadata)) diff --git a/paimon-python/pypaimon/data/variant_shredding.py b/paimon-python/pypaimon/data/variant_shredding.py new file mode 100644 index 000000000000..04502eebe13e --- /dev/null +++ b/paimon-python/pypaimon/data/variant_shredding.py @@ -0,0 +1,979 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +################################################################################ + +"""Shredded Parquet VARIANT support for pypaimon. + +Paimon stores VARIANT columns in Parquet using a "shredded" format that enables +efficient sub-field reading and predicate pushdown. This module provides: + + 1. 
VariantSchema / build_variant_schema() + Parse the PyArrow struct type of a shredded VARIANT column into a tree + that mirrors the Java ``VariantSchema`` class. + + 2. rebuild_value() / rebuild() + Reconstruct standard ``(value: bytes, metadata: bytes)`` VARIANT binary + from a shredded row dict. Mirrors ``ShreddingUtils.rebuild()`` in Java. + + 3. assemble_shredded_column() + High-level helper used by ``FormatPyArrowReader`` to post-process batches + that contain shredded VARIANT columns. + + 4. is_shredded_variant() + Detect shredded VARIANT columns in a file schema. + +Shredded column layout (Parquet GROUP → PyArrow struct): + - ``metadata``: binary — the top-level key dictionary + - ``value``: binary (optional) — overflow bytes for un-shredded fields + - ``typed_value``: struct — per-field typed sub-columns (shredded fields) + +Each field inside ``typed_value`` has the same ``{value, typed_value}`` +structure recursively. No ``field_`` prefix is used; sub-column names are +the exact variant key names. 
+""" + +from __future__ import annotations + +import datetime +import decimal as _decimal +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import pyarrow as pa + +# --------------------------------------------------------------------------- +# Variant binary constants (mirror generic_variant.py) +# --------------------------------------------------------------------------- + +_PRIMITIVE = 0 +_SHORT_STR = 1 +_OBJECT = 2 +_ARRAY = 3 + +_NULL_TYPE_ID = 0 +_TRUE_TYPE_ID = 1 +_FALSE_TYPE_ID = 2 + +_U8_MAX = 255 +_U32_SIZE = 4 +_VERSION = 1 +_VERSION_MASK = 0x0F + +_NULL_VALUE_BYTES: bytes = bytes([((_NULL_TYPE_ID << 2) | _PRIMITIVE)]) + + +# --------------------------------------------------------------------------- +# Low-level binary helpers +# --------------------------------------------------------------------------- + +def _read_unsigned(data: bytes, pos: int, n: int) -> int: + return int.from_bytes(data[pos:pos + n], 'little', signed=False) + + +def _get_int_size(value: int) -> int: + if value <= 0xFF: + return 1 + if value <= 0xFFFF: + return 2 + if value <= 0xFFFFFF: + return 3 + return 4 + + +def _append_le(buf: bytearray, value: int, n: int) -> None: + buf.extend(value.to_bytes(n, 'little')) + + +def _primitive_header(type_id: int) -> int: + return (type_id << 2) | _PRIMITIVE + + +def _object_header(large_size: bool, id_size: int, offset_size: int) -> int: + return ( + ((1 if large_size else 0) << 6) + | ((id_size - 1) << 4) + | ((offset_size - 1) << 2) + | _OBJECT + ) + + +def _array_header(large_size: bool, offset_size: int) -> int: + return ( + ((1 if large_size else 0) << 4) + | ((offset_size - 1) << 2) + | _ARRAY + ) + + +# --------------------------------------------------------------------------- +# Metadata parsing +# --------------------------------------------------------------------------- + +def parse_metadata_dict(metadata: bytes) -> Dict[str, int]: + """Parse variant metadata bytes into a ``{key_name: 
key_id}`` mapping. + + The top-level metadata is shared across all shredded sub-fields. We parse + it once and pass the dict down to ``rebuild_value()`` so every recursive + call can look up key IDs without re-parsing. + """ + if not metadata or len(metadata) < 1: + return {} + if (metadata[0] & _VERSION_MASK) != _VERSION: + raise ValueError('MALFORMED_VARIANT: invalid metadata version') + offset_size = ((metadata[0] >> 6) & 0x3) + 1 + if len(metadata) < 1 + offset_size: + return {} + dict_size = _read_unsigned(metadata, 1, offset_size) + if dict_size == 0: + return {} + string_start = 1 + (dict_size + 2) * offset_size + result: Dict[str, int] = {} + for key_id in range(dict_size): + off = _read_unsigned(metadata, 1 + (key_id + 1) * offset_size, offset_size) + next_off = _read_unsigned(metadata, 1 + (key_id + 2) * offset_size, offset_size) + key = metadata[string_start + off:string_start + next_off].decode('utf-8') + result[key] = key_id + return result + + +# --------------------------------------------------------------------------- +# VariantSchema / build_variant_schema +# --------------------------------------------------------------------------- + +@dataclass +class ObjectField: + """One shredded field inside a ``typed_value`` object group.""" + field_name: str # exact variant key name, no "field_" prefix + schema: 'VariantSchema' + + +@dataclass +class VariantSchema: + """Describes the shredding layout of a VARIANT column or sub-column. + + Mirrors the Java ``VariantSchema`` class. Indices are positions within + the PyArrow struct type that was parsed via ``build_variant_schema()``. + + For a plain (un-shredded) VARIANT: + ``metadata_idx >= 0``, ``value_idx >= 0``, ``typed_idx < 0``. 
+ """ + typed_idx: int = -1 + value_idx: int = -1 + metadata_idx: int = -1 + num_fields: int = 0 + scalar_arrow_type: Optional[pa.DataType] = None + object_fields: Optional[List[ObjectField]] = None + object_schema_map: Optional[Dict[str, int]] = None + array_schema: Optional['VariantSchema'] = None + + def is_unshredded(self) -> bool: + """Return True if this is a plain (non-shredded) VARIANT layout.""" + return self.metadata_idx >= 0 and self.typed_idx < 0 + + +def is_shredded_variant(pa_type: pa.DataType) -> bool: + """Return True if *pa_type* is a shredded Parquet VARIANT struct. + + A shredded VARIANT column has three top-level fields: ``metadata``, + ``value`` (overflow), and ``typed_value`` (shredded sub-columns). + """ + if not pa.types.is_struct(pa_type): + return False + names = {pa_type.field(i).name for i in range(pa_type.num_fields)} + return 'metadata' in names and 'value' in names and 'typed_value' in names + + +def build_variant_schema(pa_type: pa.StructType) -> VariantSchema: + """Parse a PyArrow struct type into a ``VariantSchema`` tree. + + Works for both the top-level VARIANT column struct (which has ``metadata`` + in addition to ``value`` / ``typed_value``) and sub-field structs (which + only have ``value`` / ``typed_value``). 
+ """ + schema = VariantSchema(num_fields=pa_type.num_fields) + for i in range(pa_type.num_fields): + f = pa_type.field(i) + if f.name == 'metadata': + schema.metadata_idx = i + elif f.name == 'value': + schema.value_idx = i + elif f.name == 'typed_value': + schema.typed_idx = i + schema = _parse_typed_value_field(schema, f.type) + return schema + + +def _parse_typed_value_field(schema: VariantSchema, tv_type: pa.DataType) -> VariantSchema: + """Fill in the shredding details for a ``typed_value`` field.""" + if pa.types.is_struct(tv_type): + object_fields: List[ObjectField] = [] + for j in range(tv_type.num_fields): + sub_f = tv_type.field(j) + if pa.types.is_struct(sub_f.type): + sub_schema = build_variant_schema(sub_f.type) + else: + # Scalar typed_value embedded directly (no surrounding struct) + sub_schema = VariantSchema( + typed_idx=0, num_fields=1, scalar_arrow_type=sub_f.type + ) + object_fields.append(ObjectField(sub_f.name, sub_schema)) + schema.object_fields = object_fields + schema.object_schema_map = { + of.field_name: idx for idx, of in enumerate(object_fields) + } + elif pa.types.is_list(tv_type) or pa.types.is_large_list(tv_type): + elem_type = tv_type.value_type + if pa.types.is_struct(elem_type): + schema.array_schema = build_variant_schema(elem_type) + else: + schema.array_schema = VariantSchema( + typed_idx=0, num_fields=1, scalar_arrow_type=elem_type + ) + else: + schema.scalar_arrow_type = tv_type + return schema + + +# --------------------------------------------------------------------------- +# Scalar encoding: Arrow typed value → variant binary bytes +# --------------------------------------------------------------------------- + +def _encode_scalar_to_value_bytes(typed_value, arrow_type: pa.DataType) -> bytes: + """Encode a typed Python scalar (from PyArrow .as_py()) to variant value bytes.""" + from pypaimon.data.generic_variant import _GenericVariantBuilder # local import avoids circular + builder = _GenericVariantBuilder() + 
_append_scalar(builder, typed_value, arrow_type) + gv = builder.result() + # _pos == 0 so value() returns all bytes + return gv.value() + + +def _append_scalar(builder, value, arrow_type: pa.DataType) -> None: + """Dispatch a typed Python scalar into the appropriate builder method.""" + if value is None: + builder.append_null() + return + + if pa.types.is_boolean(arrow_type): + builder.append_boolean(bool(value)) + elif pa.types.is_integer(arrow_type): + builder.append_long(int(value)) + elif pa.types.is_float64(arrow_type): + builder.append_double(float(value)) + elif pa.types.is_float32(arrow_type): + builder.append_float(float(value)) + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + builder.append_string(str(value)) + elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + builder.append_binary(bytes(value)) + elif pa.types.is_date32(arrow_type): + # PyArrow converts date32 to datetime.date + if isinstance(value, datetime.date): + days = (value - datetime.date(1970, 1, 1)).days + else: + days = int(value) + builder.append_date(days) + elif pa.types.is_timestamp(arrow_type): + # PyArrow converts timestamp to datetime.datetime + if isinstance(value, datetime.datetime): + if value.tzinfo is not None: + epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) + micros = int((value - epoch).total_seconds() * 1_000_000) + builder.append_timestamp(micros) + else: + epoch = datetime.datetime(1970, 1, 1) + micros = int((value - epoch).total_seconds() * 1_000_000) + builder.append_timestamp_ntz(micros) + else: + builder.append_timestamp_ntz(int(value)) + elif pa.types.is_decimal(arrow_type): + if isinstance(value, _decimal.Decimal): + builder.append_decimal(value) + else: + builder.append_decimal(_decimal.Decimal(str(value))) + else: + # Fallback: encode as string + builder.append_string(str(value)) + + +# --------------------------------------------------------------------------- +# Object / array binary 
construction +# --------------------------------------------------------------------------- + +def _build_object_value(fields: List[Tuple[int, bytes]]) -> bytes: + """Build object variant value bytes from ``(key_id, value_bytes)`` pairs. + + The variant spec requires fields sorted by key_id. + """ + if not fields: + # Empty object: header + size=0 + one zero-offset sentinel + buf = bytearray() + buf.append(_object_header(False, 1, 1)) + buf.append(0) # size = 0 + buf.append(0) # offset[0] = 0 (sentinel) + return bytes(buf) + + fields = sorted(fields, key=lambda f: f[0]) + size = len(fields) + data = b''.join(vb for _, vb in fields) + data_size = len(data) + + large_size = size > _U8_MAX + size_bytes = _U32_SIZE if large_size else 1 + max_id = max(kid for kid, _ in fields) + id_size = _get_int_size(max_id) + offset_size = _get_int_size(data_size) if data_size > 0 else 1 + + buf = bytearray() + buf.append(_object_header(large_size, id_size, offset_size)) + _append_le(buf, size, size_bytes) + + for kid, _ in fields: + _append_le(buf, kid, id_size) + + offset = 0 + for _, vb in fields: + _append_le(buf, offset, offset_size) + offset += len(vb) + _append_le(buf, offset, offset_size) # sentinel = total data size + + buf.extend(data) + return bytes(buf) + + +def _build_array_value(element_bytes_list: List[bytes]) -> bytes: + """Build array variant value bytes from per-element value bytes.""" + size = len(element_bytes_list) + data = b''.join(element_bytes_list) + data_size = len(data) + + large_size = size > _U8_MAX + size_bytes = _U32_SIZE if large_size else 1 + offset_size = _get_int_size(data_size) if data_size > 0 else 1 + + buf = bytearray() + buf.append(_array_header(large_size, offset_size)) + _append_le(buf, size, size_bytes) + + offset = 0 + for eb in element_bytes_list: + _append_le(buf, offset, offset_size) + offset += len(eb) + _append_le(buf, offset, offset_size) # sentinel + + buf.extend(data) + return bytes(buf) + + +def 
_extract_overflow_fields(overflow_bytes: bytes) -> List[Tuple[int, bytes]]: + """Parse an overflow binary (a variant object) into ``(key_id, value_bytes)`` pairs. + + The overflow binary contains fields that were NOT shredded — they remain + encoded as a compact variant object. + """ + if not overflow_bytes: + return [] + + b = overflow_bytes[0] + basic_type = b & 0x3 + if basic_type != _OBJECT: + return [] + + type_info = (b >> 2) & 0x3F + large_size = bool((type_info >> 4) & 0x1) + size_bytes = _U32_SIZE if large_size else 1 + size = _read_unsigned(overflow_bytes, 1, size_bytes) + if size == 0: + return [] + + id_size = ((type_info >> 2) & 0x3) + 1 + offset_size = (type_info & 0x3) + 1 + id_start = 1 + size_bytes + offset_start = id_start + size * id_size + data_start = offset_start + (size + 1) * offset_size + + # The Parquet variant spec stores field offsets in the same order as the id_table, + # but the DATA section may be laid out in a different order (e.g. GenericVariantBuilder + # sorts the id/offset tables alphabetically while writing data in insertion order). + # We must sort by offset to determine each field's byte boundaries correctly. + pairs: List[Tuple[int, int]] = [] + for i in range(size): + key_id = _read_unsigned(overflow_bytes, id_start + i * id_size, id_size) + off = _read_unsigned(overflow_bytes, offset_start + i * offset_size, offset_size) + pairs.append((key_id, off)) + + sentinel = _read_unsigned(overflow_bytes, offset_start + size * offset_size, offset_size) + + # Sort by offset so that adjacent entries define contiguous data boundaries. 
+ sorted_pairs = sorted(pairs, key=lambda p: p[1]) + boundaries = [p[1] for p in sorted_pairs] + [sentinel] + offset_to_end = {boundaries[j]: boundaries[j + 1] for j in range(len(boundaries) - 1)} + + fields: List[Tuple[int, bytes]] = [] + for key_id, off in pairs: + end = offset_to_end[off] + field_bytes = bytes(overflow_bytes[data_start + off:data_start + end]) + fields.append((key_id, field_bytes)) + return fields + + +# --------------------------------------------------------------------------- +# Core rebuild algorithm (mirrors ShreddingUtils.rebuild() in Java) +# --------------------------------------------------------------------------- + +def rebuild_value( + row: dict, + schema: VariantSchema, + key_dict: Dict[str, int], +) -> Optional[bytes]: + """Reconstruct variant value bytes from a shredded sub-row dict. + + Args: + row: Python dict from ``PyArrow StructScalar.as_py()``. Keys are + ``'value'`` and/or ``'typed_value'``. + schema: VariantSchema for this level. + key_dict: ``{key_name: key_id}`` parsed from the top-level metadata. + + Returns: + Variant value bytes, or ``None`` if the field is absent (both + ``typed_value`` and ``value`` are null — "missing from this row"). 
+ """ + typed_value = row.get('typed_value') if schema.typed_idx >= 0 else None + overflow = row.get('value') if schema.value_idx >= 0 else None + + # if both null → field is absent in this row + if typed_value is None and overflow is None: + return None + + # if typed_value is null → use overflow bytes directly + if typed_value is None: + return bytes(overflow) + + if schema.scalar_arrow_type is not None: + return _encode_scalar_to_value_bytes(typed_value, schema.scalar_arrow_type) + + if schema.object_fields is not None: + return _rebuild_object(typed_value, schema, key_dict, overflow) + + if schema.array_schema is not None: + return _rebuild_array(typed_value, schema.array_schema, key_dict) + + # No sub-schema for typed_value; fall back to overflow + return bytes(overflow) if overflow is not None else None + + +def _rebuild_object( + typed_value: dict, + schema: VariantSchema, + key_dict: Dict[str, int], + overflow_bytes: Optional[bytes], +) -> bytes: + """Rebuild an object variant from shredded object sub-fields.""" + fields: List[Tuple[int, bytes]] = [] + + for obj_field in schema.object_fields: + fname = obj_field.field_name + sub_row = typed_value.get(fname) if isinstance(typed_value, dict) else None + if sub_row is None: + continue + + field_value = rebuild_value(sub_row, obj_field.schema, key_dict) + if field_value is None: + continue + + key_id = key_dict.get(fname) + if key_id is None: + # Key not found in metadata — data integrity issue, skip + continue + fields.append((key_id, field_value)) + + # Merge with overflow (fields not shredded) + if overflow_bytes: + fields.extend(_extract_overflow_fields(bytes(overflow_bytes))) + + return _build_object_value(fields) + + +def _rebuild_array( + typed_value, + element_schema: VariantSchema, + key_dict: Dict[str, int], +) -> bytes: + """Rebuild an array variant from a shredded list.""" + if typed_value is None: + typed_value = [] + + element_bytes_list: List[bytes] = [] + for element_row in typed_value: + if 
element_row is None:
+            element_bytes_list.append(_NULL_VALUE_BYTES)
+        elif isinstance(element_row, dict):
+            eb = rebuild_value(element_row, element_schema, key_dict)
+            element_bytes_list.append(eb if eb is not None else _NULL_VALUE_BYTES)
+        else:
+            # Scalar element directly (no surrounding {value, typed_value} struct)
+            if element_schema.scalar_arrow_type is not None:
+                eb = _encode_scalar_to_value_bytes(element_row, element_schema.scalar_arrow_type)
+            else:
+                eb = _NULL_VALUE_BYTES
+            element_bytes_list.append(eb)
+
+    return _build_array_value(element_bytes_list)
+
+
+def rebuild(
+    row: dict,
+    schema: VariantSchema,
+    key_dict: Optional[Dict[str, int]] = None,
+) -> Tuple[bytes, bytes]:
+    """Reconstruct ``(value_bytes, metadata_bytes)`` from a top-level shredded row.
+
+    Args:
+        row: Top-level shredded row dict (contains ``'metadata'``,
+            ``'value'``, and/or ``'typed_value'`` keys).
+        schema: Top-level VariantSchema (from ``build_variant_schema()``).
+        key_dict: Optional pre-parsed ``{key_name: key_id}`` mapping. If None,
+            it is parsed from ``row['metadata']`` automatically.
+
+    Returns:
+        ``(value_bytes, metadata_bytes)`` suitable for constructing a standard
+        VARIANT ``struct<value, metadata>`` row.
+    """
+    raw_metadata = row.get('metadata')
+    if raw_metadata is None:
+        raise ValueError("Shredded VARIANT row missing 'metadata' field")
+    metadata = bytes(raw_metadata)
+
+    if key_dict is None:
+        key_dict = parse_metadata_dict(metadata)
+
+    value_bytes = rebuild_value(row, schema, key_dict)
+    if value_bytes is None:
+        value_bytes = _NULL_VALUE_BYTES
+
+    return value_bytes, metadata
+
+
+# ---------------------------------------------------------------------------
+# High-level column assembly
+# ---------------------------------------------------------------------------
+
+#: The canonical VARIANT Arrow type (struct<value, metadata>).
+VARIANT_ARROW_TYPE = pa.struct([
+    pa.field('value', pa.binary(), nullable=False),
+    pa.field('metadata', pa.binary(), nullable=False),
+])
+
+
+def assemble_shredded_column(column: pa.Array, schema: VariantSchema) -> pa.Array:
+    """Convert a shredded VARIANT column to standard ``struct<value, metadata>``.
+
+    Args:
+        column: A PyArrow Array whose type is a shredded VARIANT struct.
+        schema: VariantSchema built from the column's type.
+
+    Returns:
+        A ``pa.StructArray`` with type ``struct<value, metadata>``.
+    """
+    rows = column.to_pylist()
+    assembled = []
+    # all rows in a batch typically share the same metadata; cache the parsed dict
+    key_dict_cache: Optional[Dict[str, int]] = None
+
+    for row in rows:
+        if row is None:
+            assembled.append(None)
+            continue
+
+        raw_meta = row.get('metadata')
+        if raw_meta is None:
+            assembled.append(None)
+            continue
+
+        metadata = bytes(raw_meta)
+        if key_dict_cache is None:
+            key_dict_cache = parse_metadata_dict(metadata)
+
+        value_bytes = rebuild_value(row, schema, key_dict_cache)
+        if value_bytes is None:
+            value_bytes = _NULL_VALUE_BYTES
+
+        assembled.append({'value': value_bytes, 'metadata': metadata})
+
+    return pa.array(assembled, type=VARIANT_ARROW_TYPE)
+
+
+# ---------------------------------------------------------------------------
+# Write-side shredding
+# ---------------------------------------------------------------------------
+
+def _paimon_type_str_to_arrow(type_str: str) -> pa.DataType:
+    """Map a Paimon SQL type string to the Arrow type used in ``typed_value``."""
+    from pypaimon.schema.data_types import AtomicType, PyarrowFieldParser
+    try:
+        return PyarrowFieldParser.from_paimon_type(AtomicType(type_str.upper()))
+    except Exception:
+        return pa.binary()
+
+
+def _parse_sub_schema(type_def) -> VariantSchema:
+    """Recursively parse a Paimon type definition (string or dict) into a VariantSchema.
+
+    The resulting schema is for a sub-field struct, so ``value_idx=0``,
+    ``typed_idx=1``, no ``metadata_idx``.
+ """ + if isinstance(type_def, str): + # Scalar leaf: e.g. "BIGINT", "VARCHAR", "DOUBLE" + arrow_type = _paimon_type_str_to_arrow(type_def) + return VariantSchema( + value_idx=0, + typed_idx=1, + num_fields=2, + scalar_arrow_type=arrow_type, + ) + + if not isinstance(type_def, dict): + return VariantSchema(value_idx=0, num_fields=1) + + kind = type_def.get('type', '').upper() + + if kind == 'ROW': + sub_fields_def = type_def.get('fields', []) + object_fields: List[ObjectField] = [] + for f in sub_fields_def: + fname = f.get('name', '') + ftype = f.get('type', 'BINARY') + sub_schema = _parse_sub_schema(ftype) + object_fields.append(ObjectField(fname, sub_schema)) + schema = VariantSchema( + value_idx=0, + typed_idx=1, + num_fields=2, + object_fields=object_fields, + object_schema_map={of.field_name: i for i, of in enumerate(object_fields)}, + ) + return schema + + if kind == 'ARRAY': + elem_type = type_def.get('element', 'BINARY') + elem_schema = _parse_sub_schema(elem_type) + return VariantSchema( + value_idx=0, + typed_idx=1, + num_fields=2, + array_schema=elem_schema, + ) + + # Fallback: treat as scalar using the type string + arrow_type = _paimon_type_str_to_arrow(kind) + return VariantSchema( + value_idx=0, + typed_idx=1, + num_fields=2, + scalar_arrow_type=arrow_type, + ) + + +def parse_shredding_schema_option(json_str: str) -> Dict[str, List[ObjectField]]: + """Parse the ``variant.shreddingSchema`` option value. + + Args: + json_str: JSON-encoded Paimon ROW type where each top-level field + corresponds to a VARIANT column name, and its type is a ROW + listing the sub-fields to shred. + + Returns: + ``{variant_col_name: [ObjectField, ...]}`` mapping. + + Raises: + ValueError: if the JSON is invalid or the top-level type is not ROW. 
+ """ + import json as _json + data = _json.loads(json_str) + if data.get('type', '').upper() != 'ROW': + raise ValueError( + f"variant.shreddingSchema must be a JSON-encoded ROW type, got: {data.get('type')}" + ) + + result: Dict[str, List[ObjectField]] = {} + for field_def in data.get('fields', []): + col_name = field_def.get('name', '') + col_type = field_def.get('type', {}) + + if isinstance(col_type, dict) and col_type.get('type', '').upper() == 'ROW': + sub_fields_def = col_type.get('fields', []) + obj_fields: List[ObjectField] = [] + for sf in sub_fields_def: + fname = sf.get('name', '') + ftype = sf.get('type', 'BINARY') + sub_schema = _parse_sub_schema(ftype) + obj_fields.append(ObjectField(fname, sub_schema)) + result[col_name] = obj_fields + else: + # Top-level field type is not a ROW — skip (can't shred a non-object schema) + pass + + return result + + +def _fid(field_id: int) -> dict: + """Return PyArrow field metadata dict that sets the Parquet field ID. + + PyArrow respects the ``PARQUET:field_id`` key when writing Parquet, which + ensures ``parquetType.getId()`` returns a non-null value on the Java reader + side (``ParquetSchemaConverter.convertToPaimonField`` calls + ``parquetType.getId().intValue()`` unconditionally). + """ + return {b'PARQUET:field_id': str(field_id).encode()} + + +def _leaf_arrow_type_for_write(schema: VariantSchema) -> pa.DataType: + """Return the Arrow type for the ``typed_value`` leaf of a sub-field struct. + + Used by ``sub_field_output_type``; field IDs are NOT embedded here since the + result describes output column types, not Parquet-serialised fields. 
+ """ + if schema.scalar_arrow_type is not None: + return schema.scalar_arrow_type + if schema.object_fields is not None: + return pa.struct([ + pa.field(of.field_name, pa.struct([ + pa.field('value', pa.binary(), nullable=True), + pa.field('typed_value', _leaf_arrow_type_for_write(of.schema), nullable=True), + ]), nullable=True) + for of in schema.object_fields + ]) + return pa.binary() + + +def _leaf_arrow_type_for_write_with_ids(schema: VariantSchema) -> pa.DataType: + """Like ``_leaf_arrow_type_for_write`` but embeds ``PARQUET:field_id`` metadata. + + Used by ``shredding_schema_to_arrow_type`` so that Parquet field IDs are + present in every nested field of the written file. + """ + if schema.scalar_arrow_type is not None: + return schema.scalar_arrow_type + if schema.object_fields is not None: + inner_fields = [] + for i, of in enumerate(schema.object_fields): + inner_sub = pa.struct([ + pa.field('value', pa.binary(), nullable=True, metadata=_fid(0)), + pa.field( + 'typed_value', + _leaf_arrow_type_for_write_with_ids(of.schema), + nullable=True, + metadata=_fid(1), + ), + ]) + inner_fields.append( + pa.field(of.field_name, inner_sub, nullable=False, metadata=_fid(i + 1)) + ) + return pa.struct(inner_fields) + return pa.binary() + + +def shredding_schema_to_arrow_type(obj_fields: List[ObjectField]) -> pa.StructType: + """Convert an ``[ObjectField]`` list into the PyArrow struct type for a shredded column. + + The produced type is the canonical Parquet shredding layout:: + + struct< + metadata: binary NOT NULL, (field_id=0) + value: binary, (field_id=1) + typed_value: struct< (field_id=2) + field_a: struct< (field_id=1, NOT NULL) + value: binary, (field_id=0) + typed_value: (field_id=1) + >, + ... + > + > + + ``PARQUET:field_id`` metadata is embedded on every field so that the Java + ``ParquetSchemaConverter.convertToPaimonField`` can call + ``parquetType.getId().intValue()`` without a NullPointerException. 
+ + Sub-field structs within ``typed_value`` are marked NOT NULL (``nullable=False``) + to match the Java shredding schema where each named sub-field carries a + ``.notNull()`` annotation. + """ + sub_field_defs = [] + for i, of in enumerate(obj_fields): + typed_val_type = _leaf_arrow_type_for_write_with_ids(of.schema) + sub_struct = pa.struct([ + pa.field('value', pa.binary(), nullable=True, metadata=_fid(0)), + pa.field('typed_value', typed_val_type, nullable=True, metadata=_fid(1)), + ]) + # Java's variantShreddingSchema marks each named sub-field as .notNull() + sub_field_defs.append(pa.field(of.field_name, sub_struct, nullable=False, metadata=_fid(i + 1))) + + return pa.struct([ + pa.field('metadata', pa.binary(), nullable=False, metadata=_fid(0)), + pa.field('value', pa.binary(), nullable=True, metadata=_fid(1)), + pa.field('typed_value', pa.struct(sub_field_defs), nullable=True, metadata=_fid(2)), + ]) + + +def _decompose_field_bytes( + field_bytes: bytes, + schema: VariantSchema, + metadata: bytes, +) -> dict: + """Decompose a variant field's value bytes into a ``{value, typed_value}`` sub-struct dict. + + This is the write-direction counterpart of ``rebuild_value()``. + + Args: + field_bytes: Variant value bytes for a single field extracted from the + original variant binary. + schema: VariantSchema describing how this field should be shredded. + metadata: Top-level metadata bytes (used for key ID lookups in nested objects). + + Returns: + A Python dict ``{'value': bytes_or_none, 'typed_value': val_or_dict_or_none}`` + suitable for building a PyArrow struct array row. 
+ """ + from pypaimon.data.generic_variant import GenericVariant + + if schema.scalar_arrow_type is not None: + try: + py_val = GenericVariant(field_bytes, metadata).to_python() + except Exception: + return {'value': field_bytes, 'typed_value': None} + return {'value': None, 'typed_value': py_val} + + if schema.object_fields is not None: + if not field_bytes or (field_bytes[0] & 0x3) != _OBJECT: + return {'value': field_bytes, 'typed_value': None} + + all_sub = _extract_overflow_fields(field_bytes) + key_dict = parse_metadata_dict(metadata) + id_to_name = {v: k for k, v in key_dict.items()} + shredded_names = {of.field_name for of in schema.object_fields} + + overflow_pairs = [ + (kid, fb) for kid, fb in all_sub + if id_to_name.get(kid) not in shredded_names + ] + shredded_by_name = { + id_to_name[kid]: fb for kid, fb in all_sub + if kid in id_to_name and id_to_name[kid] in shredded_names + } + + typed_value: Dict[str, dict] = {} + for of in schema.object_fields: + fname = of.field_name + if fname in shredded_by_name: + typed_value[fname] = _decompose_field_bytes( + shredded_by_name[fname], of.schema, metadata + ) + else: + typed_value[fname] = {'value': None, 'typed_value': None} + + overflow_bytes = _build_object_value(overflow_pairs) if overflow_pairs else None + return {'value': overflow_bytes, 'typed_value': typed_value} + + # No shredding sub-schema: treat field bytes as overflow + return {'value': field_bytes, 'typed_value': None} + + +def decompose_variant( + gv: 'GenericVariant', + obj_fields: List[ObjectField], +) -> dict: + """Decompose a ``GenericVariant`` into a shredded row dict for writing. + + This is the inverse of ``rebuild()`` / ``rebuild_value()`` — it takes a + fully-encoded variant and splits it into the shredded ``{metadata, value, + typed_value}`` structure that Parquet shredding expects. + + Args: + gv: The ``GenericVariant`` to decompose. 
+ obj_fields: List of ``ObjectField`` from ``parse_shredding_schema_option()``, + describing which top-level object keys to shred. + + Returns: + A Python dict matching the Arrow type produced by + ``shredding_schema_to_arrow_type(obj_fields)``. + """ + metadata = gv.metadata() + value_bytes = gv.value() + + # Non-object variants cannot be shredded: put everything in overflow and set + # typed_value to NULL so the Java reader falls through to the overflow path. + if not value_bytes or (value_bytes[0] & 0x3) != _OBJECT: + return {'metadata': metadata, 'value': value_bytes, 'typed_value': None} + + all_fields = _extract_overflow_fields(value_bytes) + key_dict = parse_metadata_dict(metadata) + id_to_name = {v: k for k, v in key_dict.items()} + shredded_names = {of.field_name for of in obj_fields} + + overflow_pairs = [ + (kid, fb) for kid, fb in all_fields + if id_to_name.get(kid) not in shredded_names + ] + shredded_by_name = { + id_to_name[kid]: fb for kid, fb in all_fields + if kid in id_to_name and id_to_name[kid] in shredded_names + } + + typed_value = {} + for of in obj_fields: + fname = of.field_name + if fname in shredded_by_name: + typed_value[fname] = _decompose_field_bytes( + shredded_by_name[fname], of.schema, metadata + ) + else: + typed_value[fname] = {'value': None, 'typed_value': None} + + overflow_bytes = _build_object_value(overflow_pairs) if overflow_pairs else None + return {'metadata': metadata, 'value': overflow_bytes, 'typed_value': typed_value} + + +def shred_variant_column( + column: pa.Array, + obj_fields: List[ObjectField], + target_type: pa.StructType, +) -> pa.Array: + """Convert a standard VARIANT column to its shredded representation. + + Args: + column: A ``pa.Array`` of type ``struct`` + (the standard Paimon VARIANT layout). + obj_fields: ``[ObjectField, ...]`` from ``parse_shredding_schema_option()``. + target_type: The Arrow struct type from ``shredding_schema_to_arrow_type(obj_fields)``. 
+ + Returns: + A ``pa.StructArray`` with ``target_type`` suitable for writing to Parquet + in the shredded format. + """ + from pypaimon.data.generic_variant import GenericVariant + + rows = column.to_pylist() + result = [] + for row in rows: + if row is None: + result.append(None) + else: + gv = GenericVariant.from_arrow_struct(row) + result.append(decompose_variant(gv, obj_fields)) + return pa.array(result, type=target_type) diff --git a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py index be80d63146ab..6be1f8f485a7 100644 --- a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py +++ b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py @@ -16,13 +16,19 @@ # limitations under the License. ################################################################################ -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional import pyarrow as pa import pyarrow.dataset as ds from pyarrow import RecordBatch from pypaimon.common.file_io import FileIO +from pypaimon.data.variant_shredding import ( + VariantSchema, + assemble_shredded_column, + build_variant_schema, + is_shredded_variant, +) from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader from pypaimon.schema.data_types import DataField, PyarrowFieldParser from pypaimon.table.special_fields import SpecialFields @@ -32,27 +38,62 @@ class FormatPyArrowReader(RecordBatchReader): """ A Format Reader that reads record batch from a Parquet or ORC file using PyArrow, and filters it based on the provided predicate and projection. + + When a VARIANT column is stored in the shredded Parquet format (a struct with + ``metadata``, ``value``, and ``typed_value`` fields), this reader transparently + reconstructs the standard ``struct`` representation. 
""" - def __init__(self, file_io: FileIO, file_format: str, file_path: str, - read_fields: List[DataField], - push_down_predicate: Any, batch_size: int = 1024): + def __init__( + self, + file_io: FileIO, + file_format: str, + file_path: str, + read_fields: List[DataField], + push_down_predicate: Any, + batch_size: int = 1024, + ): + """ + Args: + file_io: FileIO for the storage backend. + file_format: ``'parquet'`` or ``'orc'``. + file_path: Path to the data file. + read_fields: Fields to project (in order). + push_down_predicate: Optional Arrow expression predicate. + batch_size: Target rows per batch. + """ file_path_for_pyarrow = file_io.to_filesystem_path(file_path) - self.dataset = ds.dataset(file_path_for_pyarrow, format=file_format, filesystem=file_io.filesystem) + self.dataset = ds.dataset( + file_path_for_pyarrow, format=file_format, filesystem=file_io.filesystem + ) self._file_format = file_format self.read_fields = read_fields self._read_field_names = [f.name for f in read_fields] # Identify which fields exist in the file and which are missing - file_schema_names = set(self.dataset.schema.names) - self.existing_fields = [f.name for f in read_fields if f.name in file_schema_names] - self.missing_fields = [f.name for f in read_fields if f.name not in file_schema_names] + file_schema = self.dataset.schema + file_schema_names = set(file_schema.names) + self.existing_fields = [ + f.name for f in read_fields if f.name in file_schema_names + ] + self.missing_fields = [ + f.name for f in read_fields if f.name not in file_schema_names + ] + + # column name → VariantSchema for shredded columns that need assembly + self._shredded_schemas: Dict[str, VariantSchema] = {} + for name in self.existing_fields: + try: + field_type = file_schema.field(name).type + except KeyError: + continue + if is_shredded_variant(field_type): + self._shredded_schemas[name] = build_variant_schema(field_type) - # Only pass existing fields to PyArrow scanner to avoid errors self.reader = 
self.dataset.scanner( columns=self.existing_fields, filter=push_down_predicate, - batch_size=batch_size + batch_size=batch_size, ).to_reader() self._output_schema = ( @@ -63,9 +104,12 @@ def read_arrow_batch(self) -> Optional[RecordBatch]: try: batch = self.reader.read_next_batch() - if self._file_format == 'orc' and self._output_schema is not None: + if self._file_format == "orc" and self._output_schema is not None: batch = self._cast_orc_time_columns(batch) + if self._shredded_schemas: + batch = self._assemble_shredded_variants(batch) + if not self.missing_fields: return batch @@ -97,23 +141,43 @@ def _type_for_missing(name: str) -> pa.DataType: all_columns.append(missing_columns[column_idx]) nullable = not SpecialFields.is_system_field(field_name) out_fields.append(pa.field(field_name, col_type, nullable=nullable)) - # Create a new RecordBatch with all columns + return pa.RecordBatch.from_arrays(all_columns, schema=pa.schema(out_fields)) except StopIteration: return None + def _assemble_shredded_variants(self, batch: pa.RecordBatch) -> pa.RecordBatch: + """Replace shredded VARIANT columns with standard struct.""" + changed = False + columns = list(batch.columns) + fields = list(batch.schema) + + for i, f in enumerate(fields): + if f.name in self._shredded_schemas: + schema = self._shredded_schemas[f.name] + new_col = assemble_shredded_column(columns[i], schema) + columns[i] = new_col + fields[i] = pa.field(f.name, new_col.type, nullable=f.nullable) + changed = True + + if not changed: + return batch + return pa.RecordBatch.from_arrays(columns, schema=pa.schema(fields)) + def _cast_orc_time_columns(self, batch): - """Cast int32 TIME columns back to time32('ms') when reading ORC. 
- """ + """Cast int32 TIME columns back to time32('ms') when reading ORC.""" columns = [] fields = [] changed = False for i, name in enumerate(batch.schema.names): col = batch.column(i) idx = self._output_schema.get_field_index(name) - if idx >= 0 and pa.types.is_int32(col.type) \ - and pa.types.is_time(self._output_schema.field(idx).type): + if ( + idx >= 0 + and pa.types.is_int32(col.type) + and pa.types.is_time(self._output_schema.field(idx).type) + ): col = col.cast(self._output_schema.field(idx).type) fields.append(self._output_schema.field(idx)) changed = True diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index ebb5612c435c..d2271d9f0aff 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -454,6 +454,28 @@ def parse_data_field( ) +def is_variant_struct(pa_type: pyarrow.StructType) -> bool: + """Return True if *pa_type* is the two-field BINARY struct used to encode VARIANT. + + Paimon Java stores VARIANT as a Parquet GROUP with exactly two non-nullable + BINARY primitives: ``value`` (field index 0) and ``metadata`` (field index 1). + PyArrow surfaces this group as a struct type; we fingerprint it here so that + :meth:`PyarrowFieldParser.to_paimon_type` can round-trip it back to VARIANT + instead of misclassifying it as a generic ROW type. + + This heuristic is fragile by necessity — Arrow has no native Variant type yet. + It will not mis-fire on ordinary ROW fields as long as callers do not name two + non-nullable binary columns ``value`` / ``metadata`` at the same nesting level. 
+ """ + if pa_type.num_fields != 2: + return False + f0, f1 = pa_type[0], pa_type[1] + return ( + f0.name == 'value' and pyarrow.types.is_binary(f0.type) and not f0.nullable + and f1.name == 'metadata' and pyarrow.types.is_binary(f1.type) and not f1.nullable + ) + + class PyarrowFieldParser: @staticmethod @@ -481,6 +503,11 @@ def from_paimon_type(data_type: DataType) -> pyarrow.DataType: return pyarrow.binary() elif type_name == 'BLOB': return pyarrow.large_binary() + elif type_name == 'VARIANT': + return pyarrow.struct([ + pyarrow.field('value', pyarrow.binary(), nullable=False), + pyarrow.field('metadata', pyarrow.binary(), nullable=False), + ]) elif type_name.startswith('DECIMAL'): if type_name == 'DECIMAL': return pyarrow.decimal128(10, 0) # default to 10, 0 @@ -591,6 +618,8 @@ def to_paimon_type(pa_type: pyarrow.DataType, nullable: bool) -> DataType: key_type = PyarrowFieldParser.to_paimon_type(pa_type.key_type, nullable) value_type = PyarrowFieldParser.to_paimon_type(pa_type.item_type, nullable) return MapType(nullable, key_type, value_type) + elif types.is_struct(pa_type) and is_variant_struct(pa_type): + return AtomicType('VARIANT', nullable) elif types.is_struct(pa_type): pa_type: pyarrow.StructType fields = [] diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 3eee324b6c16..01baf771516b 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -25,6 +25,7 @@ import pyarrow as pa from parameterized import parameterized from pypaimon.catalog.catalog_factory import CatalogFactory +from pypaimon.data.generic_variant import GenericVariant from pypaimon.schema.schema import Schema from pypaimon.read.read_builder import ReadBuilder @@ -670,3 +671,195 @@ def test_compact_conflict_shard_update(self): self.assertIn("conflicts", str(ctx.exception)) tc.close() print(f"Conflict detected as 
expected: {ctx.exception}") + + def test_py_read_variant_table(self): + """Python reads a VARIANT-column table written by Java (Java→Python E2E).""" + table = self.catalog.get_table('default.variant_test') + read_builder = table.new_read_builder() + table_scan = read_builder.new_scan() + table_read = read_builder.new_read() + splits = table_scan.plan().splits() + result = table_read.to_arrow(splits) + + self.assertEqual(result.num_rows, 3) + + # VARIANT maps to struct + payload_field = result.schema.field('payload') + self.assertTrue(pa.types.is_struct(payload_field.type), + f"Expected struct type for VARIANT, got {payload_field.type}") + self.assertEqual(payload_field.type.num_fields, 2) + self.assertEqual(payload_field.type[0].name, 'value') + self.assertEqual(payload_field.type[1].name, 'metadata') + self.assertTrue(pa.types.is_binary(payload_field.type[0].type)) + self.assertTrue(pa.types.is_binary(payload_field.type[1].type)) + + # All rows should have non-null payload structs + payload_col = result.column('payload') + for i in range(result.num_rows): + row = payload_col[i].as_py() + self.assertIsNotNone(row, f"Row {i}: expected non-null VARIANT") + self.assertIn('value', row) + self.assertIn('metadata', row) + self.assertIsInstance(row['value'], bytes) + self.assertIsInstance(row['metadata'], bytes) + self.assertGreater(len(row['value']), 0) + + # Verify bytes are non-empty and can be decoded via GenericVariant + result_sorted = table_sort_by(result, 'id') + id_list = result_sorted.column('id').to_pylist() + payload_list = result_sorted.column('payload').to_pylist() + + # Row 1: Alice, {"age":30,"city":"Beijing"} + alice_data = GenericVariant.from_arrow_struct(payload_list[id_list.index(1)]).to_python() + self.assertEqual(alice_data['age'], 30) + self.assertEqual(alice_data['city'], 'Beijing') + + # Row 2: Bob, {"age":25,"city":"Shanghai"} + bob_data = GenericVariant.from_arrow_struct(payload_list[id_list.index(2)]).to_python() + 
self.assertEqual(bob_data['age'], 25) + self.assertEqual(bob_data['city'], 'Shanghai') + + # Row 3: Carol, [1,2,3] + carol_data = GenericVariant.from_arrow_struct(payload_list[id_list.index(3)]).to_python() + self.assertEqual(carol_data, [1, 2, 3]) + + print("test_py_read_variant_table: verified {} VARIANT rows".format(result.num_rows)) + + # Also verify shredded VARIANT: Java wrote variant_shredded_test with + # parquet.variant.shreddingSchema (age+city shredded). Python must reassemble + # the shredded Parquet columns back into standard struct. + # Requires Python >= 3.7 (variant_shredding module uses __future__ annotations). + if sys.version_info[:2] < (3, 7): + print("test_py_read_variant_table: skipping shredded VARIANT check (Python < 3.7)") + return + shredded_table = self.catalog.get_table('default.variant_shredded_test') + shredded_rb = shredded_table.new_read_builder() + shredded_result = shredded_rb.new_read().to_arrow( + shredded_rb.new_scan().plan().splits()) + self.assertEqual(shredded_result.num_rows, 3) + + # Assembled column must be the same struct shape + shredded_pf = shredded_result.schema.field('payload') + self.assertTrue(pa.types.is_struct(shredded_pf.type), + "shredded VARIANT should assemble to struct, got {}".format(shredded_pf.type)) + self.assertEqual(shredded_pf.type.num_fields, 2) + self.assertEqual(shredded_pf.type[0].name, 'value') + self.assertEqual(shredded_pf.type[1].name, 'metadata') + + # Verify decoded values match what Java wrote + shredded_sorted = table_sort_by(shredded_result, 'id') + shredded_ids = shredded_sorted.column('id').to_pylist() + shredded_payloads = shredded_sorted.column('payload').to_pylist() + + # Row 1: Alice {"age":30,"city":"Beijing"} — both fields were shredded + alice = GenericVariant.from_arrow_struct( + shredded_payloads[shredded_ids.index(1)]).to_python() + self.assertEqual(alice['age'], 30) + self.assertEqual(alice['city'], 'Beijing') + + # Row 2: Bob {"age":25,"city":"Shanghai"} — both fields were 
shredded + bob = GenericVariant.from_arrow_struct( + shredded_payloads[shredded_ids.index(2)]).to_python() + self.assertEqual(bob['age'], 25) + self.assertEqual(bob['city'], 'Shanghai') + + # Row 3: Carol [1,2,3] — array, no shredded fields; everything in overflow + carol = GenericVariant.from_arrow_struct( + shredded_payloads[shredded_ids.index(3)]).to_python() + self.assertEqual(carol, [1, 2, 3]) + + print("test_py_read_variant_table: verified {} shredded VARIANT rows".format( + shredded_result.num_rows)) + + def test_py_write_variant_table(self): + """Python writes a VARIANT-column table for Java to read back (Python→Java E2E). + + Data written: + id=1 payload={"name":"test","value":42} + id=2 payload=[10,20,30] + id=3 payload="hello" + id=4 payload=null + """ + variant_type = pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + pa_schema = pa.schema([ + ('id', pa.int32()), + ('name', pa.string()), + ('payload', variant_type), + ]) + schema = Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}) + + table_name = 'default.py_variant_test' + self.catalog.drop_table(table_name, True) + self.catalog.create_table(table_name, schema, False) + table = self.catalog.get_table(table_name) + + variant_col = GenericVariant.to_arrow_array([ + GenericVariant.from_json('{"name":"test","value":42}'), + GenericVariant.from_json('[10,20,30]'), + GenericVariant.from_json('"hello"'), + None, # SQL NULL at the column level, not a VARIANT containing JSON null + ]) + data = pa.table({ + 'id': pa.array([1, 2, 3, 4], type=pa.int32()), + 'name': pa.array(['row1', 'row2', 'row3', 'row4'], type=pa.string()), + 'payload': variant_col, + }, schema=pa_schema) + + write_builder = table.new_batch_write_builder() + table_write = write_builder.new_write() + table_commit = write_builder.new_commit() + table_write.write_arrow(data) + table_commit.commit(table_write.prepare_commit()) + table_write.close() + 
table_commit.close() + print("test_py_write_variant_table: wrote 4 VARIANT rows to {}".format(table_name)) + + # Also write a shredded VARIANT table (py_variant_shredded_test) for Java to read. + # Python shreds the 'age' (BIGINT) and 'city' (VARCHAR) sub-fields of 'payload' + # when writing Parquet. Java must reassemble the shredded columns on read. + # Requires Python >= 3.7 (variant_shredding module uses __future__ annotations). + if sys.version_info[:2] < (3, 7): + print("test_py_write_variant_table: skipping shredded VARIANT write (Python < 3.7)") + return + shredding_json = ( + '{"type":"ROW","fields":[{"name":"payload","type":{"type":"ROW","fields":[' + '{"name":"age","type":"BIGINT"},' + '{"name":"city","type":"VARCHAR"}' + ']}}]}' + ) + shredded_table_name = 'default.py_variant_shredded_test' + self.catalog.drop_table(shredded_table_name, True) + shredded_schema = Schema.from_pyarrow_schema( + pa_schema, + options={'bucket': '-1', 'variant.shreddingSchema': shredding_json} + ) + self.catalog.create_table(shredded_table_name, shredded_schema, False) + shredded_table = self.catalog.get_table(shredded_table_name) + + # Use data with age+city fields so the shredded sub-columns are exercised. + # Row 3 is an array — it has no age/city, so it goes entirely to overflow. 
+ shredded_variant_col = GenericVariant.to_arrow_array([ + GenericVariant.from_json('{"age":30,"city":"Beijing"}'), + GenericVariant.from_json('{"age":25,"city":"Shanghai"}'), + GenericVariant.from_json('[1,2,3]'), + ]) + shredded_data = pa.table( + { + 'id': pa.array([1, 2, 3], type=pa.int32()), + 'name': pa.array(['Alice', 'Bob', 'Carol'], type=pa.string()), + 'payload': shredded_variant_col, + }, + schema=pa_schema, + ) + shredded_wb = shredded_table.new_batch_write_builder() + shredded_tw = shredded_wb.new_write() + shredded_tc = shredded_wb.new_commit() + shredded_tw.write_arrow(shredded_data) + shredded_tc.commit(shredded_tw.prepare_commit()) + shredded_tw.close() + shredded_tc.close() + print("test_py_write_variant_table: wrote 3 shredded VARIANT rows to {}".format( + shredded_table_name)) diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py new file mode 100644 index 000000000000..63a81deb9e7d --- /dev/null +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -0,0 +1,1188 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +"""Unit tests for VARIANT type support in pypaimon. + +Covers the user-facing features described in the Python API documentation: + + - Plain VARIANT: ordinary write (GenericVariant → Parquet) and read. + - Shredded VARIANT: write-side decomposition (shred_variant_column) and + read-side transparent assembly (assemble_shredded_column). + +Sections +-------- +1. Type-system layer – schema parsing, Paimon ↔ Arrow type mapping. +2. GenericVariant container – construction, inspection, JSON round-trip. +3. Plain VARIANT I/O – PyArrow-level Parquet write/read sanity check. +4. Shredded schema utils – is_shredded_variant, parse_metadata_dict, + build_variant_schema. +5. Binary encoding helpers – _encode_scalar_to_value_bytes, _build_object_value, + _build_array_value. +6. Read-path assembly – rebuild_value, rebuild, assemble_shredded_column. +7. Write-path shredding – parse_shredding_schema_option, + shredding_schema_to_arrow_type, decompose_variant. +8. Full-chain via Paimon API – write then read through CatalogFactory / Schema / + write_arrow / to_arrow for both plain VARIANT and + shredded VARIANT. 
+""" + +import io +import json +import os +import shutil +import struct as _struct +import tempfile +import unittest + +import pyarrow as pa +import pyarrow.parquet as pq + +from pypaimon import CatalogFactory, Schema +from pypaimon.data.generic_variant import GenericVariant +from pypaimon.data.variant_shredding import ( + VARIANT_ARROW_TYPE, + VariantSchema, + _NULL_VALUE_BYTES, + _build_array_value, + _build_object_value, + _encode_scalar_to_value_bytes, + assemble_shredded_column, + build_variant_schema, + decompose_variant, + is_shredded_variant, + parse_metadata_dict, + parse_shredding_schema_option, + rebuild, + rebuild_value, + shredding_schema_to_arrow_type, +) +from pypaimon.schema.data_types import ( + AtomicType, + DataField, + DataTypeParser, + PyarrowFieldParser, + RowType, + is_variant_struct, +) + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +def _variant_arrow_type() -> pa.StructType: + """The canonical Arrow representation of a plain VARIANT column.""" + return pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + + +def _make_metadata(*keys: str) -> bytes: + """Build variant metadata bytes. 
With no keys returns the minimal header.""" + if keys: + return GenericVariant.from_python({k: None for k in keys}).metadata() + return b'\x01\x00' # version=1, dict_size=0 + + +def _scalar_sub_struct(arrow_type: pa.DataType) -> pa.StructType: + """Return struct for one scalar sub-field.""" + return pa.struct([ + pa.field('value', pa.binary(), nullable=True), + pa.field('typed_value', arrow_type, nullable=True), + ]) + + +def _object_shredded_type(*key_type_pairs) -> pa.StructType: + """Build a top-level shredded VARIANT struct with the given (key, arrow_type) sub-fields.""" + tv_fields = [ + pa.field(name, _scalar_sub_struct(atype), nullable=True) + for name, atype in key_type_pairs + ] + return pa.struct([ + pa.field('metadata', pa.binary(), nullable=False), + pa.field('value', pa.binary(), nullable=True), + pa.field('typed_value', pa.struct(tv_fields), nullable=True), + ]) + + +def _schema_json(col_name: str, sub_fields) -> str: + """Build a variant.shreddingSchema JSON string for the given column and sub-fields. + + Args: + col_name: VARIANT column name. + sub_fields: list of (field_name, paimon_type_str) pairs, e.g. + [('age', 'BIGINT'), ('city', 'VARCHAR')]. + """ + return json.dumps({ + 'type': 'ROW', + 'fields': [ + { + 'id': 0, + 'name': col_name, + 'type': { + 'type': 'ROW', + 'fields': [ + {'id': i, 'name': name, 'type': dtype} + for i, (name, dtype) in enumerate(sub_fields) + ], + }, + } + ], + }) + + +# =========================================================================== +# 1. 
Type-system layer +# =========================================================================== + +class TestVariantSchemaParsing(unittest.TestCase): + + def test_parse_variant_keyword(self): + dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT') + self.assertIsInstance(dt, AtomicType) + self.assertEqual(dt.type, 'VARIANT') + self.assertTrue(dt.nullable) + + def test_parse_variant_not_null(self): + dt = DataTypeParser.parse_atomic_type_sql_string('VARIANT NOT NULL') + self.assertIsInstance(dt, AtomicType) + self.assertFalse(dt.nullable) + + def test_variant_to_dict_roundtrip(self): + dt = AtomicType('VARIANT') + restored = DataTypeParser.parse_data_type(dt.to_dict()) + self.assertEqual(dt, restored) + + def test_variant_str(self): + self.assertEqual(str(AtomicType('VARIANT')), 'VARIANT') + self.assertEqual(str(AtomicType('VARIANT', nullable=False)), 'VARIANT NOT NULL') + + +class TestVariantFromPaimonType(unittest.TestCase): + + def _arrow_type(self): + return PyarrowFieldParser.from_paimon_type(AtomicType('VARIANT')) + + def test_returns_struct(self): + self.assertTrue(pa.types.is_struct(self._arrow_type())) + self.assertEqual(self._arrow_type().num_fields, 2) + + def test_field_names(self): + t = self._arrow_type() + self.assertEqual(t.field(0).name, 'value') + self.assertEqual(t.field(1).name, 'metadata') + + def test_field_types_are_binary(self): + t = self._arrow_type() + self.assertTrue(pa.types.is_binary(t.field(0).type)) + self.assertTrue(pa.types.is_binary(t.field(1).type)) + + def test_fields_not_nullable(self): + t = self._arrow_type() + self.assertFalse(t.field(0).nullable) + self.assertFalse(t.field(1).nullable) + + def test_from_paimon_field(self): + df = DataField(id=0, name='payload', type=AtomicType('VARIANT')) + pa_field = PyarrowFieldParser.from_paimon_field(df) + self.assertEqual(pa_field.name, 'payload') + self.assertTrue(pa.types.is_struct(pa_field.type)) + self.assertTrue(pa_field.nullable) + + def test_from_paimon_schema(self): + 
fields = [ + DataField(id=0, name='id', type=AtomicType('BIGINT')), + DataField(id=1, name='payload', type=AtomicType('VARIANT')), + ] + schema = PyarrowFieldParser.from_paimon_schema(fields) + self.assertEqual(schema.field('payload').type, _variant_arrow_type()) + + +class TestVariantToPaimonType(unittest.TestCase): + + def test_is_variant_struct_positive(self): + self.assertTrue(is_variant_struct(_variant_arrow_type())) + + def test_is_variant_struct_wrong_names(self): + st = pa.struct([ + pa.field('val', pa.binary(), nullable=False), + pa.field('meta', pa.binary(), nullable=False), + ]) + self.assertFalse(is_variant_struct(st)) + + def test_is_variant_struct_nullable_fields(self): + st = pa.struct([ + pa.field('value', pa.binary(), nullable=True), + pa.field('metadata', pa.binary(), nullable=False), + ]) + self.assertFalse(is_variant_struct(st)) + + def test_is_variant_struct_wrong_types(self): + st = pa.struct([ + pa.field('value', pa.string(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + self.assertFalse(is_variant_struct(st)) + + def test_is_variant_struct_extra_fields(self): + st = pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + pa.field('typed_value', pa.int64(), nullable=True), + ]) + self.assertFalse(is_variant_struct(st)) + + def test_to_paimon_type_variant(self): + result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=True) + self.assertIsInstance(result, AtomicType) + self.assertEqual(result.type, 'VARIANT') + self.assertTrue(result.nullable) + + def test_to_paimon_type_variant_not_null(self): + result = PyarrowFieldParser.to_paimon_type(_variant_arrow_type(), nullable=False) + self.assertFalse(result.nullable) + + def test_ordinary_struct_maps_to_row_type(self): + st = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string())]) + result = PyarrowFieldParser.to_paimon_type(st, nullable=True) + self.assertIsInstance(result, 
RowType) + + def test_struct_same_names_different_types_is_row_type(self): + st = pa.struct([ + pa.field('value', pa.string(), nullable=False), + pa.field('metadata', pa.string(), nullable=False), + ]) + result = PyarrowFieldParser.to_paimon_type(st, nullable=True) + self.assertIsInstance(result, RowType) + + +class TestVariantSchemaRoundTrip(unittest.TestCase): + + def test_paimon_to_arrow_to_paimon(self): + original = DataField(id=0, name='v', type=AtomicType('VARIANT')) + pa_field = PyarrowFieldParser.from_paimon_field(original) + restored = PyarrowFieldParser.to_paimon_type(pa_field.type, pa_field.nullable) + self.assertIsInstance(restored, AtomicType) + self.assertEqual(restored.type, 'VARIANT') + + def test_mixed_schema_round_trip(self): + fields = [ + DataField(id=0, name='id', type=AtomicType('BIGINT')), + DataField(id=1, name='payload', type=AtomicType('VARIANT')), + DataField(id=2, name='ts', type=AtomicType('TIMESTAMP(6)')), + ] + pa_schema = PyarrowFieldParser.from_paimon_schema(fields) + restored = PyarrowFieldParser.to_paimon_schema(pa_schema) + + self.assertEqual(restored[1].name, 'payload') + self.assertIsInstance(restored[1].type, AtomicType) + self.assertEqual(restored[1].type.type, 'VARIANT') + self.assertEqual(restored[2].name, 'ts') + + +# =========================================================================== +# 2. 
GenericVariant container +# =========================================================================== + +class TestGenericVariantContainer(unittest.TestCase): + + def test_from_json_returns_instance(self): + gv = GenericVariant.from_json('{"age":30}') + self.assertIsInstance(gv, GenericVariant) + self.assertIsInstance(gv.value(), bytes) + self.assertIsInstance(gv.metadata(), bytes) + self.assertGreater(len(gv.value()), 0) + self.assertGreater(len(gv.metadata()), 0) + + def test_from_python_returns_instance(self): + gv = GenericVariant.from_python({'a': 1, 'b': 'hello'}) + self.assertIsInstance(gv, GenericVariant) + self.assertIsInstance(gv.value(), bytes) + + def test_from_arrow_struct_roundtrip(self): + original = GenericVariant.from_json('{"x":1,"y":2}') + restored = GenericVariant.from_arrow_struct( + {'value': original.value(), 'metadata': original.metadata()}) + self.assertEqual(restored.value(), original.value()) + self.assertEqual(restored.metadata(), original.metadata()) + + def test_to_json_roundtrip(self): + json_str = '{"age":30,"city":"Beijing"}' + gv = GenericVariant.from_json(json_str) + parsed = json.loads(gv.to_json()) + self.assertEqual(parsed, json.loads(json_str)) + + def test_to_python_object(self): + gv = GenericVariant.from_json('{"age":30,"city":"Beijing"}') + result = gv.to_python() + self.assertEqual(result, {'age': 30, 'city': 'Beijing'}) + + def test_to_python_array(self): + gv = GenericVariant.from_json('[1,2,3]') + self.assertEqual(gv.to_python(), [1, 2, 3]) + + def test_to_python_null(self): + gv = GenericVariant.from_json('null') + self.assertIsNone(gv.to_python()) + + def test_to_python_string(self): + gv = GenericVariant.from_json('"hello"') + self.assertEqual(gv.to_python(), 'hello') + + def test_to_python_number(self): + gv = GenericVariant.from_json('42') + self.assertEqual(gv.to_python(), 42) + + def test_from_python_none(self): + gv = GenericVariant.from_python(None) + self.assertIsNone(gv.to_python()) + + def 
test_from_python_nested(self): + obj = {'user': {'name': 'Alice', 'scores': [10, 20, 30]}, 'active': True} + gv = GenericVariant.from_python(obj) + result = gv.to_python() + self.assertEqual(result['user']['name'], 'Alice') + self.assertEqual(result['user']['scores'], [10, 20, 30]) + self.assertTrue(result['active']) + + def test_equality(self): + gv1 = GenericVariant.from_json('{"a":1}') + gv2 = GenericVariant.from_json('{"a":1}') + self.assertEqual(gv1, gv2) + + def test_repr_and_str(self): + gv = GenericVariant.from_json('"hello"') + self.assertIn('hello', repr(gv)) + self.assertIn('hello', str(gv)) + + +class TestToArrowArray(unittest.TestCase): + + def test_basic(self): + gv1 = GenericVariant.from_json('{"a":1}') + gv2 = GenericVariant.from_json('[1,2]') + arr = GenericVariant.to_arrow_array([gv1, gv2]) + self.assertIsInstance(arr, pa.StructArray) + self.assertEqual(len(arr), 2) + restored = GenericVariant.from_arrow_struct(arr[0].as_py()) + self.assertEqual(restored.to_python(), {'a': 1}) + + def test_with_nulls(self): + arr = GenericVariant.to_arrow_array([GenericVariant.from_json('42'), None]) + self.assertEqual(len(arr), 2) + self.assertIsNotNone(arr[0].as_py()) + self.assertIsNone(arr[1].as_py()) + + def test_empty(self): + self.assertEqual(len(GenericVariant.to_arrow_array([])), 0) + + def test_arrow_type(self): + gv = GenericVariant.from_json('true') + arr = GenericVariant.to_arrow_array([gv]) + self.assertEqual(arr.type, _variant_arrow_type()) + + +class TestJsonRoundtrip(unittest.TestCase): + + def _check(self, json_str): + gv = GenericVariant.from_json(json_str) + self.assertEqual(json.loads(gv.to_json()), json.loads(json_str)) + + def test_nested_object_array(self): + self._check('{"users":[{"name":"Alice","age":30},{"name":"Bob","age":25}]}') + + def test_deep_nesting(self): + self._check('{"a":{"b":{"c":{"d":42}}}}') + + def test_array_of_objects(self): + self._check('[{"x":1},{"x":2},{"x":3}]') + + def test_all_primitive_types(self): + 
self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}') + + def test_empty_object(self): + gv = GenericVariant.from_json('{}') + self.assertEqual(gv.to_python(), {}) + + def test_empty_array(self): + gv = GenericVariant.from_json('[]') + self.assertEqual(gv.to_python(), []) + + +# =========================================================================== +# 3. Plain VARIANT — PyArrow-level Parquet sanity check +# (verifies the physical struct layout works in Parquet) +# =========================================================================== + +def _make_variant_bytes(json_str: str) -> bytes: + """Produce a minimal VARIANT value payload encoding a long-string primitive.""" + payload = json_str.encode('utf-8') + return _struct.pack('<BI', 0x40, len(payload)) + payload + + +class TestPlainVariantParquet(unittest.TestCase): + + def test_plain_variant_parquet_roundtrip(self): + """Write plain VARIANT <value, metadata> bytes to Parquet and verify schema + values.""" + meta = _make_metadata() + payload_col = pa.array( + [{'value': _make_variant_bytes('{"key":"hello"}'), 'metadata': meta}, + {'value': _make_variant_bytes('42'), 'metadata': meta}], + type=_variant_arrow_type(), + ) + original = pa.table({'id': [1, 2], 'payload': payload_col}) + buf = io.BytesIO() + pq.write_table(original, buf) + buf.seek(0) + restored = pq.read_table(buf) + + self.assertEqual(restored.schema.field('payload').type, _variant_arrow_type()) + self.assertEqual(restored.num_rows, 2) + row0 = restored.column('payload')[0].as_py() + self.assertEqual(row0['value'], _make_variant_bytes('{"key":"hello"}')) + self.assertEqual(row0['metadata'], meta) + + def test_null_variant_row(self): + """SQL-NULL VARIANT row survives Parquet round-trip as None.""" + payload_col = pa.array( + [None, {'value': _make_variant_bytes('true'), 'metadata': _make_metadata()}], + type=_variant_arrow_type(), + ) + buf = io.BytesIO() + pq.write_table(pa.table({'id': [1, 2], 'payload': payload_col}), buf) + buf.seek(0) + restored = pq.read_table(buf) + self.assertIsNone(restored.column('payload')[0].as_py()) + self.assertIsNotNone(restored.column('payload')[1].as_py()) + + def 
test_to_arrow_array_roundtrip_parquet(self): + """GenericVariant.to_arrow_array() produces bytes that survive a Parquet cycle.""" + gvs = [ + GenericVariant.from_python({'score': 1}), + GenericVariant.from_python({'score': 2}), + None, + ] + col = GenericVariant.to_arrow_array(gvs) + buf = io.BytesIO() + pq.write_table(pa.table({'v': col}), buf) + buf.seek(0) + restored = pq.read_table(buf).column('v') + self.assertEqual( + GenericVariant.from_arrow_struct(restored[0].as_py()).to_python(), {'score': 1}) + self.assertEqual( + GenericVariant.from_arrow_struct(restored[1].as_py()).to_python(), {'score': 2}) + self.assertIsNone(restored[2].as_py()) + + +# =========================================================================== +# 4. Shredded VARIANT — schema detection and parsing +# =========================================================================== + +class TestIsShredded(unittest.TestCase): + + def test_shredded_is_detected(self): + t = _object_shredded_type(('age', pa.int64())) + self.assertTrue(is_shredded_variant(t)) + + def test_plain_variant_not_shredded(self): + plain = pa.struct([ + pa.field('value', pa.binary(), nullable=False), + pa.field('metadata', pa.binary(), nullable=False), + ]) + self.assertFalse(is_shredded_variant(plain)) + + def test_arbitrary_struct_not_shredded(self): + t = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string())]) + self.assertFalse(is_shredded_variant(t)) + + def test_non_struct_not_shredded(self): + self.assertFalse(is_shredded_variant(pa.binary())) + self.assertFalse(is_shredded_variant(pa.int64())) + + def test_missing_typed_value_not_shredded(self): + t = pa.struct([ + pa.field('metadata', pa.binary()), + pa.field('value', pa.binary()), + ]) + self.assertFalse(is_shredded_variant(t)) + + +class TestParseMetadataDict(unittest.TestCase): + + def test_single_key(self): + meta = _make_metadata('age') + d = parse_metadata_dict(meta) + self.assertIn('age', d) + self.assertEqual(d['age'], 0) + + def 
test_multiple_keys(self): + meta = _make_metadata('age', 'name', 'city') + d = parse_metadata_dict(meta) + self.assertEqual(set(d.keys()), {'age', 'name', 'city'}) + self.assertEqual(d['age'], 0) + self.assertEqual(d['name'], 1) + self.assertEqual(d['city'], 2) + + def test_empty_metadata(self): + meta = b'\x01\x00' + d = parse_metadata_dict(meta) + self.assertEqual(d, {}) + + def test_empty_bytes(self): + d = parse_metadata_dict(b'') + self.assertEqual(d, {}) + + +class TestBuildVariantSchema(unittest.TestCase): + + def test_simple_scalar(self): + t = _object_shredded_type(('age', pa.int64()), ('name', pa.string())) + schema = build_variant_schema(t) + self.assertEqual(schema.metadata_idx, 0) + self.assertEqual(schema.value_idx, 1) + self.assertEqual(schema.typed_idx, 2) + self.assertIsNotNone(schema.object_fields) + self.assertEqual(len(schema.object_fields), 2) + self.assertEqual(schema.object_fields[0].field_name, 'age') + self.assertEqual(schema.object_fields[1].field_name, 'name') + + def test_sub_field_scalar_type(self): + t = _object_shredded_type(('age', pa.int64())) + schema = build_variant_schema(t) + age_schema = schema.object_fields[0].schema + self.assertEqual(age_schema.scalar_arrow_type, pa.int64()) + self.assertEqual(age_schema.value_idx, 0) + self.assertEqual(age_schema.typed_idx, 1) + + def test_object_schema_map(self): + t = _object_shredded_type(('age', pa.int64()), ('name', pa.string())) + schema = build_variant_schema(t) + self.assertIn('age', schema.object_schema_map) + self.assertIn('name', schema.object_schema_map) + + def test_is_unshredded_for_plain_variant(self): + schema = VariantSchema(metadata_idx=0, value_idx=1) + self.assertTrue(schema.is_unshredded()) + + def test_not_unshredded_for_shredded_schema(self): + t = _object_shredded_type(('age', pa.int64())) + schema = build_variant_schema(t) + self.assertFalse(schema.is_unshredded()) + + def test_no_typed_value_field(self): + plain = pa.struct([ + pa.field('metadata', pa.binary(), 
nullable=False), + pa.field('value', pa.binary(), nullable=True), + ]) + schema = build_variant_schema(plain) + self.assertEqual(schema.metadata_idx, 0) + self.assertEqual(schema.value_idx, 1) + self.assertEqual(schema.typed_idx, -1) + self.assertTrue(schema.is_unshredded()) + + +# =========================================================================== +# 5. Binary encoding helpers +# =========================================================================== + +class TestEncodeScalar(unittest.TestCase): + + def _roundtrip(self, json_str: str, arrow_type: pa.DataType): + """Encode a scalar to bytes via the Arrow type, then decode via GenericVariant.""" + gv_orig = GenericVariant.from_json(json_str) + typed_value = pa.array([gv_orig.to_python()], type=arrow_type).to_pylist()[0] + value_bytes = _encode_scalar_to_value_bytes(typed_value, arrow_type) + gv = GenericVariant(value_bytes, b'\x01\x00') + return gv.to_python() + + def test_int(self): + self.assertEqual(self._roundtrip('42', pa.int64()), 42) + + def test_float(self): + value_bytes = _encode_scalar_to_value_bytes(3.14, pa.float64()) + gv = GenericVariant(value_bytes, b'\x01\x00') + self.assertAlmostEqual(gv.to_python(), 3.14, places=5) + + def test_bool_true(self): + self.assertEqual(self._roundtrip('true', pa.bool_()), True) + + def test_bool_false(self): + self.assertEqual(self._roundtrip('false', pa.bool_()), False) + + def test_string(self): + self.assertEqual(self._roundtrip('"hello"', pa.string()), 'hello') + + def test_null(self): + value_bytes = _encode_scalar_to_value_bytes(None, pa.int64()) + gv = GenericVariant(value_bytes, b'\x01\x00') + self.assertIsNone(gv.to_python()) + + +class TestBuildBinary(unittest.TestCase): + + def test_build_object_empty(self): + obj_bytes = _build_object_value([]) + gv = GenericVariant(obj_bytes, b'\x01\x00') + self.assertEqual(gv.to_python(), {}) + + def test_build_object_one_field(self): + meta = _make_metadata('age') + key_dict = parse_metadata_dict(meta) + 
age_val = _encode_scalar_to_value_bytes(30, pa.int64()) + obj_bytes = _build_object_value([(key_dict['age'], age_val)]) + gv = GenericVariant(obj_bytes, meta) + self.assertEqual(gv.to_python(), {'age': 30}) + + def test_build_array_empty(self): + arr_bytes = _build_array_value([]) + gv = GenericVariant(arr_bytes, b'\x01\x00') + self.assertEqual(gv.to_python(), []) + + def test_build_array_three_ints(self): + elem1 = _encode_scalar_to_value_bytes(1, pa.int64()) + elem2 = _encode_scalar_to_value_bytes(2, pa.int64()) + elem3 = _encode_scalar_to_value_bytes(3, pa.int64()) + arr_bytes = _build_array_value([elem1, elem2, elem3]) + gv = GenericVariant(arr_bytes, b'\x01\x00') + self.assertEqual(gv.to_python(), [1, 2, 3]) + + def test_build_array_with_null(self): + elem1 = _encode_scalar_to_value_bytes(1, pa.int64()) + arr_bytes = _build_array_value([elem1, _NULL_VALUE_BYTES]) + gv = GenericVariant(arr_bytes, b'\x01\x00') + result = gv.to_python() + self.assertEqual(result[0], 1) + self.assertIsNone(result[1]) + + +# =========================================================================== +# 6. 
Read-path assembly +# =========================================================================== + +class TestRebuildValue(unittest.TestCase): + """Test the core rebuild_value() function for various shredding scenarios.""" + + def _age_schema(self) -> VariantSchema: + """Schema for a sub-field struct {value: binary, typed_value: int64}.""" + return build_variant_schema(_scalar_sub_struct(pa.int64())) + + def _metadata_and_dict(self, *keys: str): + meta = _make_metadata(*keys) + return meta, parse_metadata_dict(meta) + + def test_scalar_typed_value(self): + schema = self._age_schema() + row = {'value': None, 'typed_value': 30} + result = rebuild_value(row, schema, {}) + gv = GenericVariant(result, b'\x01\x00') + self.assertEqual(gv.to_python(), 30) + + def test_scalar_overflow(self): + """When typed_value is None, fall back to overflow value bytes.""" + age_val = _encode_scalar_to_value_bytes(42, pa.int64()) + schema = self._age_schema() + row = {'value': age_val, 'typed_value': None} + result = rebuild_value(row, schema, {}) + gv = GenericVariant(result, b'\x01\x00') + self.assertEqual(gv.to_python(), 42) + + def test_both_null_returns_none(self): + """Both value and typed_value being None signals an absent field.""" + schema = self._age_schema() + row = {'value': None, 'typed_value': None} + self.assertIsNone(rebuild_value(row, schema, {})) + + def test_object_with_shredded_fields(self): + meta, key_dict = self._metadata_and_dict('age', 'name') + t = _object_shredded_type(('age', pa.int64()), ('name', pa.string())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': None, + 'typed_value': { + 'age': {'value': None, 'typed_value': 30}, + 'name': {'value': None, 'typed_value': 'Alice'}, + }, + } + value_bytes, _ = rebuild(row, schema, key_dict) + gv = GenericVariant(value_bytes, meta) + result = gv.to_python() + self.assertEqual(result['age'], 30) + self.assertEqual(result['name'], 'Alice') + + def test_object_absent_field_skipped(self): + 
"""A sub-field with both value=None and typed_value=None is omitted from output.""" + meta, key_dict = self._metadata_and_dict('age', 'name') + t = _object_shredded_type(('age', pa.int64()), ('name', pa.string())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': None, + 'typed_value': { + 'age': {'value': None, 'typed_value': 25}, + 'name': {'value': None, 'typed_value': None}, # absent + }, + } + value_bytes, _ = rebuild(row, schema, key_dict) + gv = GenericVariant(value_bytes, meta) + result = gv.to_python() + self.assertEqual(result['age'], 25) + self.assertNotIn('name', result) + + def test_object_with_overflow(self): + """Fields not in typed_value are preserved from overflow bytes.""" + original = GenericVariant.from_json('{"age": 30, "extra": "overflow_val"}') + overflow_bytes = original.value() + meta = original.metadata() + key_dict = parse_metadata_dict(meta) + + t = _object_shredded_type(('age', pa.int64())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': overflow_bytes, + 'typed_value': { + 'age': {'value': None, 'typed_value': 30}, + }, + } + value_bytes = rebuild_value(row, schema, key_dict) + gv = GenericVariant(value_bytes, meta) + result = gv.to_python() + self.assertEqual(result['age'], 30) + self.assertIn('extra', result) + + def test_typed_value_null_uses_overflow(self): + """If typed_value struct is None for the whole row, full overflow bytes are used.""" + original = GenericVariant.from_json('{"age": 99}') + meta = original.metadata() + key_dict = parse_metadata_dict(meta) + + t = _object_shredded_type(('age', pa.int64())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': original.value(), + 'typed_value': None, # typed_value absent for this row + } + value_bytes, _ = rebuild(row, schema, key_dict) + gv = GenericVariant(value_bytes, meta) + self.assertEqual(gv.to_python()['age'], 99) + + def test_rebuild_matches_direct_variant(self): + """Bytes rebuilt from 
shredded form must equal bytes from GenericVariant.from_json.""" + original_json = '{"score": 42, "tag": "test"}' + original_gv = GenericVariant.from_json(original_json) + meta = original_gv.metadata() + key_dict = parse_metadata_dict(meta) + + t = _object_shredded_type(('score', pa.int64()), ('tag', pa.string())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': None, + 'typed_value': { + 'score': {'value': None, 'typed_value': 42}, + 'tag': {'value': None, 'typed_value': 'test'}, + }, + } + value_bytes, _ = rebuild(row, schema, key_dict) + gv_rebuilt = GenericVariant(value_bytes, meta) + self.assertEqual(gv_rebuilt.to_python(), original_gv.to_python()) + + +class TestRebuild(unittest.TestCase): + + def test_missing_metadata_raises(self): + schema = VariantSchema() + with self.assertRaises(ValueError): + rebuild({'value': None, 'typed_value': None}, schema) + + def test_basic_roundtrip(self): + meta = _make_metadata('x') + key_dict = parse_metadata_dict(meta) + t = _object_shredded_type(('x', pa.int64())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': None, + 'typed_value': {'x': {'value': None, 'typed_value': 7}}, + } + value_bytes, ret_meta = rebuild(row, schema, key_dict) + self.assertEqual(ret_meta, meta) + gv = GenericVariant(value_bytes, ret_meta) + self.assertEqual(gv.to_python(), {'x': 7}) + + def test_auto_parse_key_dict(self): + """When key_dict is None, rebuild() parses it automatically from the metadata field.""" + meta = _make_metadata('y') + t = _object_shredded_type(('y', pa.string())) + schema = build_variant_schema(t) + row = { + 'metadata': meta, + 'value': None, + 'typed_value': {'y': {'value': None, 'typed_value': 'hi'}}, + } + value_bytes, _ = rebuild(row, schema) + gv = GenericVariant(value_bytes, meta) + self.assertEqual(gv.to_python(), {'y': 'hi'}) + + +class TestAssembleShreddedColumn(unittest.TestCase): + + def test_basic_two_rows(self): + """assemble_shredded_column converts a 
shredded Arrow column to plain VARIANT.""" + meta = _make_metadata('age', 'name') + t = _object_shredded_type(('age', pa.int64()), ('name', pa.string())) + rows = [ + {'metadata': meta, 'value': None, 'typed_value': { + 'age': {'value': None, 'typed_value': 30}, + 'name': {'value': None, 'typed_value': 'Alice'}, + }}, + {'metadata': meta, 'value': None, 'typed_value': { + 'age': {'value': None, 'typed_value': 25}, + 'name': {'value': None, 'typed_value': 'Bob'}, + }}, + ] + col = pa.array(rows, type=t) + schema = build_variant_schema(col.type) + result = assemble_shredded_column(col, schema) + + self.assertEqual(result.type, VARIANT_ARROW_TYPE) + self.assertEqual(len(result), 2) + + gv0 = GenericVariant.from_arrow_struct(result[0].as_py()) + self.assertEqual(gv0.to_python(), {'age': 30, 'name': 'Alice'}) + + gv1 = GenericVariant.from_arrow_struct(result[1].as_py()) + self.assertEqual(gv1.to_python(), {'age': 25, 'name': 'Bob'}) + + +# =========================================================================== +# 7. 
Write-path shredding +# =========================================================================== + +class TestShreddingWrite(unittest.TestCase): + """Tests for the static-mode write shredding API.""" + + def _obj_fields_for(self, col_name, sub_fields): + """Helper: build obj_fields for a single column from (name, type) pairs.""" + return parse_shredding_schema_option(_schema_json(col_name, sub_fields))[col_name] + + # ----------------------------------------------------------------------- + # parse_shredding_schema_option + # ----------------------------------------------------------------------- + + def test_parse_single_scalar_field(self): + result = parse_shredding_schema_option(json.dumps({ + 'type': 'ROW', + 'fields': [ + {'id': 0, 'name': 'payload', 'type': { + 'type': 'ROW', + 'fields': [{'id': 0, 'name': 'age', 'type': 'INT'}], + }}, + ], + })) + self.assertIn('payload', result) + obj_fields = result['payload'] + self.assertEqual(len(obj_fields), 1) + self.assertEqual(obj_fields[0].field_name, 'age') + self.assertIsNotNone(obj_fields[0].schema.scalar_arrow_type) + + def test_parse_multiple_fields_multiple_cols(self): + result = parse_shredding_schema_option(json.dumps({ + 'type': 'ROW', + 'fields': [ + {'id': 0, 'name': 'col_a', 'type': {'type': 'ROW', 'fields': [ + {'id': 0, 'name': 'x', 'type': 'BIGINT'}, + {'id': 1, 'name': 'y', 'type': 'VARCHAR'}, + ]}}, + {'id': 1, 'name': 'col_b', 'type': {'type': 'ROW', 'fields': [ + {'id': 0, 'name': 'flag', 'type': 'BOOLEAN'}, + ]}}, + ], + })) + self.assertIn('col_a', result) + self.assertIn('col_b', result) + self.assertEqual(len(result['col_a']), 2) + self.assertEqual(result['col_a'][0].field_name, 'x') + self.assertEqual(result['col_a'][1].field_name, 'y') + self.assertEqual(result['col_b'][0].field_name, 'flag') + + def test_parse_invalid_top_level_type_raises(self): + with self.assertRaises(ValueError): + parse_shredding_schema_option(json.dumps({'type': 'ARRAY', 'element': 'INT'})) + + # 
----------------------------------------------------------------------- + # shredding_schema_to_arrow_type + # ----------------------------------------------------------------------- + + def test_arrow_type_structure_single_scalar(self): + obj_fields = self._obj_fields_for('col', [('score', 'BIGINT')]) + arrow_type = shredding_schema_to_arrow_type(obj_fields) + + self.assertTrue(pa.types.is_struct(arrow_type)) + names = {arrow_type.field(i).name for i in range(arrow_type.num_fields)} + self.assertIn('metadata', names) + self.assertIn('value', names) + self.assertIn('typed_value', names) + + tv = arrow_type.field('typed_value').type + self.assertTrue(pa.types.is_struct(tv)) + self.assertEqual(tv.num_fields, 1) + self.assertEqual(tv.field(0).name, 'score') + + sub = tv.field('score').type + self.assertTrue(pa.types.is_struct(sub)) + sub_names = {sub.field(i).name for i in range(sub.num_fields)} + self.assertIn('value', sub_names) + self.assertIn('typed_value', sub_names) + + # ----------------------------------------------------------------------- + # decompose_variant + # ----------------------------------------------------------------------- + + def test_decompose_scalar_field_extracted(self): + """A shredded scalar field ends up in typed_value; non-shredded fields go to overflow.""" + obj_fields = self._obj_fields_for('col', [('age', 'BIGINT')]) + gv = GenericVariant.from_python({'age': 25, 'name': 'alice'}) + result = decompose_variant(gv, obj_fields) + + self.assertIn('metadata', result) + self.assertIn('value', result) + self.assertIn('typed_value', result) + + tv = result['typed_value'] + self.assertIn('age', tv) + self.assertIsNone(tv['age']['value']) + self.assertEqual(tv['age']['typed_value'], 25) + + # 'name' not in schema → must appear in overflow + overflow = result['value'] + if overflow: + overflow_gv = GenericVariant(overflow, result['metadata']) + self.assertIn('name', overflow_gv.to_python()) + + def test_decompose_absent_field_is_null(self): + """A 
shredded field absent from the variant yields {value: None, typed_value: None}.""" + obj_fields = self._obj_fields_for('col', [('missing_field', 'BIGINT')]) + gv = GenericVariant.from_python({'other': 99}) + result = decompose_variant(gv, obj_fields) + tv = result['typed_value'] + self.assertIsNone(tv['missing_field']['value']) + self.assertIsNone(tv['missing_field']['typed_value']) + + def test_decompose_non_object_variant_all_overflow(self): + """A non-object variant goes fully into overflow; typed_value is null (not a struct). + + Java's ShreddingUtils.rebuild requires typed_value to be null (not a non-null struct + with all-null sub-fields) when the variant is not an object, so that the overflow + binary can be consumed directly without expecting it to be an OBJECT variant. + """ + obj_fields = self._obj_fields_for('col', [('x', 'BIGINT')]) + gv = GenericVariant.from_python(42) + result = decompose_variant(gv, obj_fields) + self.assertIsNotNone(result['value']) + self.assertIsNone(result['typed_value']) + + def test_decompose_multiple_typed_fields(self): + """Multiple shredded fields are all correctly split from the variant.""" + obj_fields = self._obj_fields_for('col', [('score', 'BIGINT'), ('tag', 'VARCHAR')]) + gv = GenericVariant.from_python({'score': 100, 'tag': 'gold', 'extra': True}) + result = decompose_variant(gv, obj_fields) + + tv = result['typed_value'] + self.assertEqual(tv['score']['typed_value'], 100) + self.assertEqual(tv['tag']['typed_value'], 'gold') + # 'extra' must be in overflow + overflow = result['value'] + if overflow: + overflow_py = GenericVariant(overflow, result['metadata']).to_python() + self.assertIn('extra', overflow_py) + + +# =========================================================================== +# 8. Full-chain via Paimon API +# Write and read using CatalogFactory / Schema / write_arrow / to_arrow — +# the same code path used by real applications. 
+# =========================================================================== + +class TestVariantPaimonTable(unittest.TestCase): + """End-to-end tests that use the real Paimon Python API. + + Each test creates its own table (unique name) within a shared in-process + warehouse so that tests are fully independent. + """ + + @classmethod + def setUpClass(cls): + cls.tmpdir = tempfile.mkdtemp() + warehouse = os.path.join(cls.tmpdir, 'warehouse') + cls.catalog = CatalogFactory.create({'warehouse': warehouse}) + cls.catalog.create_database('default', True) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir, ignore_errors=True) + + def _write_and_read(self, table, pa_data: pa.Table) -> pa.Table: + """Write pa_data to the table and read all rows back.""" + write_builder = table.new_batch_write_builder() + table_write = write_builder.new_write() + table_commit = write_builder.new_commit() + table_write.write_arrow(pa_data) + table_commit.commit(table_write.prepare_commit()) + table_write.close() + table_commit.close() + + read_builder = table.new_read_builder() + splits = read_builder.new_scan().plan().splits() + return read_builder.new_read().to_arrow(splits) + + def _pa_schema(self): + return pa.schema([ + pa.field('id', pa.int64()), + pa.field('payload', _variant_arrow_type()), + ]) + + def test_plain_variant_write_and_read(self): + """Plain VARIANT: GenericVariant → write_arrow → to_arrow → GenericVariant.""" + schema = Schema.from_pyarrow_schema(self._pa_schema()) + self.catalog.create_table('default.plain_variant', schema, False) + table = self.catalog.get_table('default.plain_variant') + + gvs = [ + GenericVariant.from_python({'age': 30, 'city': 'Beijing'}), + GenericVariant.from_python({'score': 99, 'active': True}), + GenericVariant.from_json('[1, 2, 3]'), + ] + data = pa.table( + {'id': [1, 2, 3], 'payload': GenericVariant.to_arrow_array(gvs)}, + schema=self._pa_schema(), + ) + result = self._write_and_read(table, data) + + 
self.assertEqual(result.num_rows, 3) + payload_col = result.column('payload') + + gv0 = GenericVariant.from_arrow_struct(payload_col[0].as_py()) + self.assertEqual(gv0.to_python(), {'age': 30, 'city': 'Beijing'}) + + gv1 = GenericVariant.from_arrow_struct(payload_col[1].as_py()) + self.assertEqual(gv1.to_python(), {'score': 99, 'active': True}) + + gv2 = GenericVariant.from_arrow_struct(payload_col[2].as_py()) + self.assertEqual(gv2.to_python(), [1, 2, 3]) + + def test_plain_variant_null_row(self): + """SQL-NULL VARIANT rows are stored and retrieved as None.""" + schema = Schema.from_pyarrow_schema(self._pa_schema()) + self.catalog.create_table('default.plain_variant_null', schema, False) + table = self.catalog.get_table('default.plain_variant_null') + + gvs = [GenericVariant.from_python({'x': 1}), None, GenericVariant.from_python({'x': 3})] + data = pa.table( + {'id': [1, 2, 3], 'payload': GenericVariant.to_arrow_array(gvs)}, + schema=self._pa_schema(), + ) + result = self._write_and_read(table, data) + + payload_col = result.column('payload') + self.assertIsNotNone(payload_col[0].as_py()) + self.assertIsNone(payload_col[1].as_py()) + self.assertIsNotNone(payload_col[2].as_py()) + + def test_shredded_variant_write_and_read(self): + """Shredded VARIANT: writer shreds automatically, reader assembles transparently.""" + shredding_json = _schema_json('payload', [('age', 'BIGINT'), ('city', 'VARCHAR')]) + schema = Schema.from_pyarrow_schema( + self._pa_schema(), + options={'variant.shreddingSchema': shredding_json}, + ) + self.catalog.create_table('default.shredded_variant', schema, False) + table = self.catalog.get_table('default.shredded_variant') + + gvs = [ + GenericVariant.from_python({'age': 28, 'city': 'Beijing'}), + GenericVariant.from_python({'age': 35, 'city': 'Shanghai'}), + ] + data = pa.table( + {'id': [1, 2], 'payload': GenericVariant.to_arrow_array(gvs)}, + schema=self._pa_schema(), + ) + result = self._write_and_read(table, data) + + 
self.assertEqual(result.num_rows, 2) + payload_col = result.column('payload') + + py0 = GenericVariant.from_arrow_struct(payload_col[0].as_py()).to_python() + self.assertEqual(py0['age'], 28) + self.assertEqual(py0['city'], 'Beijing') + + py1 = GenericVariant.from_arrow_struct(payload_col[1].as_py()).to_python() + self.assertEqual(py1['age'], 35) + self.assertEqual(py1['city'], 'Shanghai') + + def test_shredded_variant_overflow_preserved(self): + """Fields outside the shredding schema survive in overflow bytes end-to-end.""" + shredding_json = _schema_json('payload', [('age', 'BIGINT')]) + schema = Schema.from_pyarrow_schema( + self._pa_schema(), + options={'variant.shreddingSchema': shredding_json}, + ) + self.catalog.create_table('default.shredded_overflow', schema, False) + table = self.catalog.get_table('default.shredded_overflow') + + # 'city' and 'active' are NOT in the shredding schema → stored as overflow + gv = GenericVariant.from_python({'age': 30, 'city': 'Beijing', 'active': True}) + data = pa.table( + {'id': [1], 'payload': GenericVariant.to_arrow_array([gv])}, + schema=self._pa_schema(), + ) + result = self._write_and_read(table, data) + + py = GenericVariant.from_arrow_struct(result.column('payload')[0].as_py()).to_python() + self.assertEqual(py['age'], 30) + self.assertEqual(py['city'], 'Beijing') + self.assertEqual(py['active'], True) + + def test_shredded_variant_null_row(self): + """SQL-NULL VARIANT rows survive the shred → assemble cycle as None.""" + shredding_json = _schema_json('payload', [('x', 'BIGINT')]) + schema = Schema.from_pyarrow_schema( + self._pa_schema(), + options={'variant.shreddingSchema': shredding_json}, + ) + self.catalog.create_table('default.shredded_null', schema, False) + table = self.catalog.get_table('default.shredded_null') + + gvs = [GenericVariant.from_python({'x': 7}), None] + data = pa.table( + {'id': [1, 2], 'payload': GenericVariant.to_arrow_array(gvs)}, + schema=self._pa_schema(), + ) + result = 
self._write_and_read(table, data) + + payload_col = result.column('payload') + gv = GenericVariant.from_arrow_struct(payload_col[0].as_py()) + self.assertEqual(gv.to_python(), {'x': 7}) + self.assertIsNone(payload_col[1].as_py()) + + +if __name__ == '__main__': + unittest.main() diff --git a/paimon-python/pypaimon/write/writer/data_blob_writer.py b/paimon-python/pypaimon/write/writer/data_blob_writer.py index 62cbd013ece5..d170913bd917 100644 --- a/paimon-python/pypaimon/write/writer/data_blob_writer.py +++ b/paimon-python/pypaimon/write/writer/data_blob_writer.py @@ -307,6 +307,8 @@ def _write_normal_data_to_file(self, data: pa.Table) -> Optional[DataFileMeta]: file_name = f"{CoreOptions.data_file_prefix(self.options)}{uuid.uuid4()}-0.{self.file_format}" file_path = self._generate_file_path(file_name) + self._check_no_variant_for_format(data.schema) + # Write file based on format if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index 725a1fb230de..4ae5ef739a60 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -26,7 +26,7 @@ from pypaimon.data.timestamp import Timestamp from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats -from pypaimon.schema.data_types import PyarrowFieldParser +from pypaimon.schema.data_types import PyarrowFieldParser, is_variant_struct from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.row.generic_row import GenericRow @@ -70,6 +70,19 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op ) # Store the current generated external path to preserve scheme in metadata self._current_external_path: Optional[str] = None + 
self._variant_format_checked: bool = False + + # Variant shredding (static mode) — col_name → (obj_fields, target_arrow_type) + self._variant_shredding: Dict[str, Tuple] = {} + if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: + shredding_json = self.options.variant_shredding_schema() + if shredding_json: + from pypaimon.data.variant_shredding import ( + parse_shredding_schema_option, shredding_schema_to_arrow_type) + col_schemas = parse_shredding_schema_option(shredding_json) + for col_name, obj_fields in col_schemas.items(): + target_type = shredding_schema_to_arrow_type(obj_fields) + self._variant_shredding[col_name] = (obj_fields, target_type) def write(self, data: pa.RecordBatch): try: @@ -154,6 +167,22 @@ def _check_and_roll_if_needed(self): self._write_data_to_file(data_to_write) self.pending_data = remaining_data + def _check_no_variant_for_format(self, schema: pa.Schema): + """Raise NotImplementedError if any VARIANT column is present for an unsupported format. + + The check is performed only once per writer instance; subsequent calls are no-ops + because the schema and file format are both fixed for the lifetime of the writer. 
+ """ + if self._variant_format_checked: + return + self._variant_format_checked = True + if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): + for field in schema: + if pa.types.is_struct(field.type) and is_variant_struct(field.type): + raise NotImplementedError( + f"VARIANT type is not supported for {self.file_format} format" + ) + def _write_data_to_file(self, data: pa.Table): if data.num_rows == 0: return @@ -167,6 +196,11 @@ def _write_data_to_file(self, data: pa.Table): else: external_path_str = None + self._check_no_variant_for_format(data.schema) + + if self._variant_shredding: + data = self._apply_variant_shredding(data) + if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) elif self.file_format == CoreOptions.FILE_FORMAT_ORC: @@ -237,6 +271,37 @@ def _write_data_to_file(self, data: pa.Table): file_path=file_path, )) + def _apply_variant_shredding(self, data: pa.Table) -> pa.Table: + """Transform VARIANT columns into shredded Parquet format. + + Each shredded parent column is tagged with a ``PARQUET:field_id`` so that + the Java ``ParquetSchemaConverter.convertToPaimonField`` (called from + ``VariantUtils.variantFileType``) can read ``parquetType.getId().intValue()`` + without a NullPointerException. 
+ """ + from pypaimon.data.variant_shredding import shred_variant_column + columns = list(data.columns) + fields = list(data.schema) + changed = False + + paimon_field_id: Dict[str, int] = {} + for pf in self.table.fields: + paimon_field_id[pf.name] = pf.id + + for i, f in enumerate(fields): + if f.name in self._variant_shredding: + obj_fields, target_type = self._variant_shredding[f.name] + columns[i] = shred_variant_column(columns[i], obj_fields, target_type) + pid = paimon_field_id.get(f.name) + parent_meta = {b'PARQUET:field_id': str(pid).encode()} if pid is not None else None + fields[i] = pa.field(f.name, target_type, nullable=f.nullable, metadata=parent_meta) + changed = True + if not changed: + return data + if isinstance(data, pa.Table): + return pa.Table.from_arrays(columns, schema=pa.schema(fields)) + return pa.RecordBatch.from_arrays(columns, schema=pa.schema(fields)) + def _generate_file_path(self, file_name: str) -> str: if self.external_path_provider: external_path = self.external_path_provider.get_next_external_data_path(file_name) From 86e0cc6a243913030a57f1a7db5e89648be46d51 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 14:44:38 +0800 Subject: [PATCH 2/7] support VARIANT for pypaimon --- .../pypaimon/data/generic_variant.py | 108 ++---------------- .../pypaimon/data/variant_shredding.py | 26 +++-- .../read/reader/format_pyarrow_reader.py | 55 +++------ paimon-python/pypaimon/schema/data_types.py | 23 ++-- paimon-python/pypaimon/tests/variant_test.py | 8 +- .../pypaimon/write/writer/data_blob_writer.py | 2 - .../pypaimon/write/writer/data_writer.py | 36 ++---- 7 files changed, 56 insertions(+), 202 deletions(-) diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index 08ba51def448..b32e091f0dfb 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -39,18 +39,17 
@@ GenericVariant.to_arrow_array(variants)– convert a list to a PyArrow StructArray Inspection helpers (for debugging/testing): - v.to_json() – decode back to a JSON string v.to_python() – decode to native Python objects v.value() – raw value bytes v.metadata() – raw metadata bytes """ -import base64 import datetime import decimal as _decimal import enum import json as _json import struct +import uuid as _uuid # --------------------------------------------------------------------------- # Constants (matching GenericVariantUtil.java) @@ -92,7 +91,7 @@ _MAX_DECIMAL8_PRECISION = 18 _MAX_DECIMAL16_PRECISION = 38 -# Epoch for date/timestamp conversions (used by to_json / to_python) +# Epoch for date/timestamp conversions (used by to_python) _EPOCH_DATE = datetime.date(1970, 1, 1) _EPOCH_DT_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) _EPOCH_DT_NTZ = datetime.datetime(1970, 1, 1) @@ -566,7 +565,6 @@ class GenericVariant: for row in result.column('payload').to_pylist(): if row is not None: gv = GenericVariant.from_arrow_struct(row) - print(gv.to_json()) # e.g. '{"age":30,"city":"Beijing"}' print(gv.to_python()) # e.g. {'age': 30, 'city': 'Beijing'} """ @@ -617,7 +615,7 @@ def from_arrow_struct(cls, d: dict) -> 'GenericVariant': for row in result.column("payload").to_pylist(): if row is not None: gv = GenericVariant.from_arrow_struct(row) - print(gv.to_json()) + print(gv.to_python()) """ return cls(bytes(d['value']), bytes(d['metadata'])) @@ -679,97 +677,6 @@ def metadata(self) -> bytes: # -- inspection helpers (for debugging / testing) -- - def to_json(self) -> str: - """Decode the variant to a JSON string. - - Useful for debugging and testing. Variant semantics and path-based - queries are the responsibility of the application layer. 
- """ - parts = [] - self._to_json_impl(self._value, self._metadata, self._pos, parts) - return ''.join(parts) - - def _to_json_impl(self, value, metadata, pos, parts): - vtype = _variant_get_type(value, pos) - if vtype == _Type.OBJECT: - def _render(size, id_size, offset_size, id_start, offset_start, data_start): - parts.append('{') - for i in range(size): - fid = _read_unsigned(value, id_start + id_size * i, id_size) - key = _get_metadata_key(metadata, fid) - offset = _read_unsigned( - value, offset_start + offset_size * i, offset_size) - if i != 0: - parts.append(',') - parts.append(_json.dumps(key)) - parts.append(':') - self._to_json_impl(value, metadata, data_start + offset, parts) - parts.append('}') - _handle_object(value, pos, _render) - elif vtype == _Type.ARRAY: - def _render_arr(size, offset_size, offset_start, data_start): - parts.append('[') - for i in range(size): - offset = _read_unsigned( - value, offset_start + offset_size * i, offset_size) - if i != 0: - parts.append(',') - self._to_json_impl(value, metadata, data_start + offset, parts) - parts.append(']') - _handle_array(value, pos, _render_arr) - else: - b = value[pos] - basic_type = b & 0x3 - type_info = (b >> 2) & 0x3F - if vtype == _Type.NULL: - parts.append('null') - elif vtype == _Type.BOOLEAN: - parts.append('true' if type_info == _TRUE else 'false') - elif vtype == _Type.LONG: - n = _LONG_FAMILY_SIZES.get(type_info) - parts.append(str(_read_signed(value, pos + 1, n))) - elif vtype == _Type.STRING: - if basic_type == _SHORT_STR: - s = value[pos + 1:pos + 1 + type_info].decode('utf-8') - else: - length = _read_unsigned(value, pos + 1, _U32_SIZE) - s = value[pos + 1 + _U32_SIZE:pos + 1 + _U32_SIZE + length].decode('utf-8') - parts.append(_json.dumps(s)) - elif vtype == _Type.DOUBLE: - d = struct.unpack_from(' str: - return f'GenericVariant({self.to_json()!r})' + return f'GenericVariant({self.to_python()!r})' def __str__(self) -> str: - return self.to_json() + return str(self.to_python()) 
def __eq__(self, other) -> bool: if not isinstance(other, GenericVariant): diff --git a/paimon-python/pypaimon/data/variant_shredding.py b/paimon-python/pypaimon/data/variant_shredding.py index 04502eebe13e..390e1b81c399 100644 --- a/paimon-python/pypaimon/data/variant_shredding.py +++ b/paimon-python/pypaimon/data/variant_shredding.py @@ -420,13 +420,19 @@ def _extract_overflow_fields(overflow_bytes: bytes) -> List[Tuple[int, bytes]]: sentinel = _read_unsigned(overflow_bytes, offset_start + size * offset_size, offset_size) # Sort by offset so that adjacent entries define contiguous data boundaries. - sorted_pairs = sorted(pairs, key=lambda p: p[1]) - boundaries = [p[1] for p in sorted_pairs] + [sentinel] - offset_to_end = {boundaries[j]: boundaries[j + 1] for j in range(len(boundaries) - 1)} + # Track by original index (always unique) to avoid dict key collisions when + # two fields share the same offset (malformed data). + indexed_pairs = sorted(enumerate(pairs), key=lambda x: x[1][1]) + boundaries = [ip[1][1] for ip in indexed_pairs] + [sentinel] + + # end_by_orig[i] = end offset for pairs[i] + end_by_orig = [0] * size + for rank, (orig_idx, _) in enumerate(indexed_pairs): + end_by_orig[orig_idx] = boundaries[rank + 1] fields: List[Tuple[int, bytes]] = [] - for key_id, off in pairs: - end = offset_to_end[off] + for orig_idx, (key_id, off) in enumerate(pairs): + end = end_by_orig[orig_idx] field_bytes = bytes(overflow_bytes[data_start + off:data_start + end]) fields.append((key_id, field_bytes)) return fields @@ -593,8 +599,8 @@ def assemble_shredded_column(column: pa.Array, schema: VariantSchema) -> pa.Arra """ rows = column.to_pylist() assembled = [] - # all rows in a batch typically share the same metadata; cache the parsed dict - key_dict_cache: Optional[Dict[str, int]] = None + # cache parsed key dicts keyed by metadata bytes; most files share one metadata + key_dict_cache: Dict[bytes, Dict[str, int]] = {} for row in rows: if row is None: @@ -607,10 +613,10 
@@ def assemble_shredded_column(column: pa.Array, schema: VariantSchema) -> pa.Arra continue metadata = bytes(raw_meta) - if key_dict_cache is None: - key_dict_cache = parse_metadata_dict(metadata) + if metadata not in key_dict_cache: + key_dict_cache[metadata] = parse_metadata_dict(metadata) - value_bytes = rebuild_value(row, schema, key_dict_cache) + value_bytes = rebuild_value(row, schema, key_dict_cache[metadata]) if value_bytes is None: value_bytes = _NULL_VALUE_BYTES diff --git a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py index 6be1f8f485a7..a11aae9aca08 100644 --- a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py +++ b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py @@ -44,28 +44,11 @@ class FormatPyArrowReader(RecordBatchReader): reconstructs the standard ``struct`` representation. """ - def __init__( - self, - file_io: FileIO, - file_format: str, - file_path: str, - read_fields: List[DataField], - push_down_predicate: Any, - batch_size: int = 1024, - ): - """ - Args: - file_io: FileIO for the storage backend. - file_format: ``'parquet'`` or ``'orc'``. - file_path: Path to the data file. - read_fields: Fields to project (in order). - push_down_predicate: Optional Arrow expression predicate. - batch_size: Target rows per batch. 
- """ + def __init__(self, file_io: FileIO, file_format: str, file_path: str, + read_fields: List[DataField], + push_down_predicate: Any, batch_size: int = 1024): file_path_for_pyarrow = file_io.to_filesystem_path(file_path) - self.dataset = ds.dataset( - file_path_for_pyarrow, format=file_format, filesystem=file_io.filesystem - ) + self.dataset = ds.dataset(file_path_for_pyarrow, format=file_format, filesystem=file_io.filesystem) self._file_format = file_format self.read_fields = read_fields self._read_field_names = [f.name for f in read_fields] @@ -73,27 +56,21 @@ def __init__( # Identify which fields exist in the file and which are missing file_schema = self.dataset.schema file_schema_names = set(file_schema.names) - self.existing_fields = [ - f.name for f in read_fields if f.name in file_schema_names - ] - self.missing_fields = [ - f.name for f in read_fields if f.name not in file_schema_names - ] + self.existing_fields = [f.name for f in read_fields if f.name in file_schema_names] + self.missing_fields = [f.name for f in read_fields if f.name not in file_schema_names] # column name → VariantSchema for shredded columns that need assembly self._shredded_schemas: Dict[str, VariantSchema] = {} for name in self.existing_fields: - try: - field_type = file_schema.field(name).type - except KeyError: - continue + field_type = file_schema.field(name).type if is_shredded_variant(field_type): self._shredded_schemas[name] = build_variant_schema(field_type) + # Only pass existing fields to PyArrow scanner to avoid errors self.reader = self.dataset.scanner( columns=self.existing_fields, filter=push_down_predicate, - batch_size=batch_size, + batch_size=batch_size ).to_reader() self._output_schema = ( @@ -104,7 +81,7 @@ def read_arrow_batch(self) -> Optional[RecordBatch]: try: batch = self.reader.read_next_batch() - if self._file_format == "orc" and self._output_schema is not None: + if self._file_format == 'orc' and self._output_schema is not None: batch = 
self._cast_orc_time_columns(batch) if self._shredded_schemas: @@ -141,7 +118,7 @@ def _type_for_missing(name: str) -> pa.DataType: all_columns.append(missing_columns[column_idx]) nullable = not SpecialFields.is_system_field(field_name) out_fields.append(pa.field(field_name, col_type, nullable=nullable)) - + # Create a new RecordBatch with all columns return pa.RecordBatch.from_arrays(all_columns, schema=pa.schema(out_fields)) except StopIteration: @@ -166,18 +143,16 @@ def _assemble_shredded_variants(self, batch: pa.RecordBatch) -> pa.RecordBatch: return pa.RecordBatch.from_arrays(columns, schema=pa.schema(fields)) def _cast_orc_time_columns(self, batch): - """Cast int32 TIME columns back to time32('ms') when reading ORC.""" + """Cast int32 TIME columns back to time32('ms') when reading ORC. + """ columns = [] fields = [] changed = False for i, name in enumerate(batch.schema.names): col = batch.column(i) idx = self._output_schema.get_field_index(name) - if ( - idx >= 0 - and pa.types.is_int32(col.type) - and pa.types.is_time(self._output_schema.field(idx).type) - ): + if idx >= 0 and pa.types.is_int32(col.type) \ + and pa.types.is_time(self._output_schema.field(idx).type): col = col.cast(self._output_schema.field(idx).type) fields.append(self._output_schema.field(idx)) changed = True diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index d2271d9f0aff..27c1f7d145c7 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -455,24 +455,15 @@ def parse_data_field( def is_variant_struct(pa_type: pyarrow.StructType) -> bool: - """Return True if *pa_type* is the two-field BINARY struct used to encode VARIANT. - - Paimon Java stores VARIANT as a Parquet GROUP with exactly two non-nullable - BINARY primitives: ``value`` (field index 0) and ``metadata`` (field index 1). 
- PyArrow surfaces this group as a struct type; we fingerprint it here so that - :meth:`PyarrowFieldParser.to_paimon_type` can round-trip it back to VARIANT - instead of misclassifying it as a generic ROW type. - - This heuristic is fragile by necessity — Arrow has no native Variant type yet. - It will not mis-fire on ordinary ROW fields as long as callers do not name two - non-nullable binary columns ``value`` / ``metadata`` at the same nesting level. - """ + """Return True if *pa_type* is the ``struct`` encoding of VARIANT.""" if pa_type.num_fields != 2: return False - f0, f1 = pa_type[0], pa_type[1] - return ( - f0.name == 'value' and pyarrow.types.is_binary(f0.type) and not f0.nullable - and f1.name == 'metadata' and pyarrow.types.is_binary(f1.type) and not f1.nullable + names = {pa_type.field(i).name for i in range(pa_type.num_fields)} + if names != {'value', 'metadata'}: + return False + return all( + pyarrow.types.is_binary(pa_type.field(n).type) and not pa_type.field(n).nullable + for n in ('value', 'metadata') ) diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 63a81deb9e7d..86c89d9b5c67 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -322,12 +322,6 @@ def test_from_arrow_struct_roundtrip(self): self.assertEqual(restored.value(), original.value()) self.assertEqual(restored.metadata(), original.metadata()) - def test_to_json_roundtrip(self): - json_str = '{"age":30,"city":"Beijing"}' - gv = GenericVariant.from_json(json_str) - parsed = json.loads(gv.to_json()) - self.assertEqual(parsed, json.loads(json_str)) - def test_to_python_object(self): gv = GenericVariant.from_json('{"age":30,"city":"Beijing"}') result = gv.to_python() @@ -402,7 +396,7 @@ class TestJsonRoundtrip(unittest.TestCase): def _check(self, json_str): gv = GenericVariant.from_json(json_str) - self.assertEqual(json.loads(gv.to_json()), json.loads(json_str)) + 
self.assertEqual(gv.to_python(), json.loads(json_str)) def test_nested_object_array(self): self._check('{"users":[{"name":"Alice","age":30},{"name":"Bob","age":25}]}') diff --git a/paimon-python/pypaimon/write/writer/data_blob_writer.py b/paimon-python/pypaimon/write/writer/data_blob_writer.py index d170913bd917..62cbd013ece5 100644 --- a/paimon-python/pypaimon/write/writer/data_blob_writer.py +++ b/paimon-python/pypaimon/write/writer/data_blob_writer.py @@ -307,8 +307,6 @@ def _write_normal_data_to_file(self, data: pa.Table) -> Optional[DataFileMeta]: file_name = f"{CoreOptions.data_file_prefix(self.options)}{uuid.uuid4()}-0.{self.file_format}" file_path = self._generate_file_path(file_name) - self._check_no_variant_for_format(data.schema) - # Write file based on format if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: self.file_io.write_parquet(file_path, data, compression=self.compression, zstd_level=self.zstd_level) diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index 4ae5ef739a60..84b9a12dc79b 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -26,7 +26,7 @@ from pypaimon.data.timestamp import Timestamp from pypaimon.manifest.schema.data_file_meta import DataFileMeta from pypaimon.manifest.schema.simple_stats import SimpleStats -from pypaimon.schema.data_types import PyarrowFieldParser, is_variant_struct +from pypaimon.schema.data_types import PyarrowFieldParser from pypaimon.table.bucket_mode import BucketMode from pypaimon.table.row.generic_row import GenericRow @@ -70,8 +70,6 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op ) # Store the current generated external path to preserve scheme in metadata self._current_external_path: Optional[str] = None - self._variant_format_checked: bool = False - # Variant shredding (static mode) — col_name → (obj_fields, target_arrow_type) 
self._variant_shredding: Dict[str, Tuple] = {} if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: @@ -84,6 +82,10 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op target_type = shredding_schema_to_arrow_type(obj_fields) self._variant_shredding[col_name] = (obj_fields, target_type) + # Paimon field id map, used by _apply_variant_shredding; built once since + # the table schema is fixed for the lifetime of this writer. + self._paimon_field_id: Dict[str, int] = {pf.name: pf.id for pf in self.table.fields} + def write(self, data: pa.RecordBatch): try: processed_data = self._process_data(data) @@ -167,22 +169,6 @@ def _check_and_roll_if_needed(self): self._write_data_to_file(data_to_write) self.pending_data = remaining_data - def _check_no_variant_for_format(self, schema: pa.Schema): - """Raise NotImplementedError if any VARIANT column is present for an unsupported format. - - The check is performed only once per writer instance; subsequent calls are no-ops - because the schema and file format are both fixed for the lifetime of the writer. 
- """ - if self._variant_format_checked: - return - self._variant_format_checked = True - if self.file_format in (CoreOptions.FILE_FORMAT_ORC, CoreOptions.FILE_FORMAT_AVRO): - for field in schema: - if pa.types.is_struct(field.type) and is_variant_struct(field.type): - raise NotImplementedError( - f"VARIANT type is not supported for {self.file_format} format" - ) - def _write_data_to_file(self, data: pa.Table): if data.num_rows == 0: return @@ -196,8 +182,6 @@ def _write_data_to_file(self, data: pa.Table): else: external_path_str = None - self._check_no_variant_for_format(data.schema) - if self._variant_shredding: data = self._apply_variant_shredding(data) @@ -284,23 +268,17 @@ def _apply_variant_shredding(self, data: pa.Table) -> pa.Table: fields = list(data.schema) changed = False - paimon_field_id: Dict[str, int] = {} - for pf in self.table.fields: - paimon_field_id[pf.name] = pf.id - for i, f in enumerate(fields): if f.name in self._variant_shredding: obj_fields, target_type = self._variant_shredding[f.name] columns[i] = shred_variant_column(columns[i], obj_fields, target_type) - pid = paimon_field_id.get(f.name) + pid = self._paimon_field_id.get(f.name) parent_meta = {b'PARQUET:field_id': str(pid).encode()} if pid is not None else None fields[i] = pa.field(f.name, target_type, nullable=f.nullable, metadata=parent_meta) changed = True if not changed: return data - if isinstance(data, pa.Table): - return pa.Table.from_arrays(columns, schema=pa.schema(fields)) - return pa.RecordBatch.from_arrays(columns, schema=pa.schema(fields)) + return pa.Table.from_arrays(columns, schema=pa.schema(fields)) def _generate_file_path(self, file_name: str) -> str: if self.external_path_provider: From ce14cd39f4b2ee3af29fc09f238eec3ed7c781cb Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:03:46 +0800 Subject: [PATCH 3/7] support VARIANT for pypaimon --- docs/content/pypaimon/python-api.md | 11 ++-- 
.../pypaimon/common/options/core_options.py | 15 ++++++ .../pypaimon/data/generic_variant.py | 26 ++-------- .../read/reader/format_pyarrow_reader.py | 13 +++-- paimon-python/pypaimon/read/split_read.py | 3 +- paimon-python/pypaimon/tests/variant_test.py | 52 ++++++++----------- .../pypaimon/write/writer/data_writer.py | 3 +- 7 files changed, 58 insertions(+), 65 deletions(-) diff --git a/docs/content/pypaimon/python-api.md b/docs/content/pypaimon/python-api.md index 7a73b317dd8c..8bfcfdae8825 100644 --- a/docs/content/pypaimon/python-api.md +++ b/docs/content/pypaimon/python-api.md @@ -722,7 +722,7 @@ Row kind values: in the [Parquet Variant binary encoding](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md). pypaimon exposes VARIANT columns as Arrow `struct` and -provides `GenericVariant` for encoding, decoding, and path extraction. +provides `GenericVariant` for encoding and decoding. Paimon supports two Parquet storage layouts for VARIANT: @@ -748,11 +748,10 @@ for record in result.to_pylist(): if (payload := record["payload"]) is not None: gv = GenericVariant.from_arrow_struct(payload) print(gv.to_python()) # decode to Python dict / list / scalar - print(gv.to_json()) # decode to JSON string ``` `from_arrow_struct` is a lightweight operation — it only wraps the two raw byte arrays without -parsing them. Actual variant binary decoding is deferred to `to_python()` / `to_json()`. +parsing them. Actual variant binary decoding is deferred to `to_python()`. 
**Write** @@ -762,7 +761,7 @@ Build `GenericVariant` values and convert them to an Arrow column with `to_arrow import pyarrow as pa from pypaimon.data.generic_variant import GenericVariant -gv1 = GenericVariant.from_json('{"city": "Beijing", "age": 30}') +gv1 = GenericVariant.from_python({'city': 'Beijing', 'age': 30}) gv2 = GenericVariant.from_python({'tags': [1, 2, 3], 'active': True}) # None represents SQL NULL @@ -876,18 +875,16 @@ Supported Paimon type strings for shredded sub-fields: `BOOLEAN`, `INT`, `BIGINT | Method | Description | |:-------|:------------| -| `GenericVariant.from_json(json_str)` | Build from a JSON string | | `GenericVariant.from_python(obj)` | Build from a Python object (`dict`, `list`, `int`, `str`, …) | | `GenericVariant.from_arrow_struct({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row (read path) | | `GenericVariant.to_arrow_array([gv1, gv2, None, ...])` | Convert a list of `GenericVariant` (or `None`) to a `pa.StructArray` for writing | | `gv.to_python()` | Decode to native Python (`dict`, `list`, `int`, `str`, `None`, …) | -| `gv.to_json()` | Decode to a JSON string | | `gv.value()` | Return raw value bytes | | `gv.metadata()` | Return raw metadata bytes | **Limitations:** -- `VARIANT` is only supported with Parquet file format. Writing to ORC or Avro raises `NotImplementedError`. +- `VARIANT` is only supported with Parquet file format. ORC and Avro are not supported. - `VARIANT` cannot be used as a primary key or partition key. 
## Predicate diff --git a/paimon-python/pypaimon/common/options/core_options.py b/paimon-python/pypaimon/common/options/core_options.py index fe3ba668abc0..6a4b51c8c33a 100644 --- a/paimon-python/pypaimon/common/options/core_options.py +++ b/paimon-python/pypaimon/common/options/core_options.py @@ -405,6 +405,18 @@ class CoreOptions: ) ) + VARIANT_SHREDDING_ENABLED: ConfigOption[bool] = ( + ConfigOptions.key("variant.shredding.enabled") + .boolean_type() + .default_value(True) + .with_description( + "Whether to enable VARIANT shredding. When True (default), writes apply the " + "shredding schema configured via 'variant.shreddingSchema', and reads " + "automatically reassemble shredded columns back to the standard " + "struct form. Set to False to bypass both behaviours." + ) + ) + VARIANT_SHREDDING_SCHEMA: ConfigOption[str] = ( ConfigOptions.key("variant.shreddingSchema") .string_type() @@ -495,6 +507,9 @@ def metadata_stats_enabled(self, default=None): def blob_as_descriptor(self, default=None): return self.options.get(CoreOptions.BLOB_AS_DESCRIPTOR, default) + def variant_shredding_enabled(self) -> bool: + return self.options.get(CoreOptions.VARIANT_SHREDDING_ENABLED, True) + def variant_shredding_schema(self) -> Optional[str]: val = self.options.get(CoreOptions.VARIANT_SHREDDING_SCHEMA) if val is None: diff --git a/paimon-python/pypaimon/data/generic_variant.py b/paimon-python/pypaimon/data/generic_variant.py index b32e091f0dfb..a44143a981dd 100644 --- a/paimon-python/pypaimon/data/generic_variant.py +++ b/paimon-python/pypaimon/data/generic_variant.py @@ -32,7 +32,6 @@ pypaimon (the storage layer). 
Primary entry points: - GenericVariant.from_json(json_str) – build from a JSON string (for writing) GenericVariant.from_python(obj) – build from a Python object (for writing) GenericVariant(value, metadata) – wrap raw bytes read from a VARIANT column GenericVariant.from_arrow_struct(d) – wrap a row dict from a VARIANT Arrow column @@ -47,7 +46,6 @@ import datetime import decimal as _decimal import enum -import json as _json import struct import uuid as _uuid @@ -278,7 +276,7 @@ def _get_metadata_key(metadata, key_id): # --------------------------------------------------------------------------- -# _GenericVariantBuilder (for from_json / from_python) +# _GenericVariantBuilder (for from_python) # --------------------------------------------------------------------------- class _GenericVariantBuilder: @@ -552,8 +550,8 @@ class GenericVariant: import pyarrow as pa from pypaimon.data.generic_variant import GenericVariant - gv1 = GenericVariant.from_json('{"age": 30, "city": "Beijing"}') - gv2 = GenericVariant.from_json('[1, 2, 3]') + gv1 = GenericVariant.from_python({'age': 30, 'city': 'Beijing'}) + gv2 = GenericVariant.from_python([1, 2, 3]) # Create an Arrow StructArray ready for writing col = GenericVariant.to_arrow_array([gv1, gv2, None]) table = pa.table({'id': [1, 2, 3], 'payload': col}) @@ -579,20 +577,6 @@ def __init__(self, value: bytes, metadata: bytes, _pos: int = 0): # -- constructors -- - @classmethod - def from_json(cls, json_str: str) -> 'GenericVariant': - """Parse a JSON string and encode it as VARIANT binary bytes. 
- - Use this when writing VARIANT data from Python:: - - gv = GenericVariant.from_json('{"name": "Alice", "age": 30}') - col = GenericVariant.to_arrow_array([gv]) - """ - obj = _json.loads(json_str, parse_float=_decimal.Decimal) - builder = _GenericVariantBuilder() - builder.build_python(obj) - return builder.result() - @classmethod def from_python(cls, obj) -> 'GenericVariant': """Encode a Python object (dict / list / str / int / float / bool / None) as VARIANT. @@ -631,8 +615,8 @@ def to_arrow_array(cls, variants): Example:: - gv1 = GenericVariant.from_json('{"age":30}') - gv2 = GenericVariant.from_json('[1,2,3]') + gv1 = GenericVariant.from_python({'age': 30}) + gv2 = GenericVariant.from_python([1, 2, 3]) col = GenericVariant.to_arrow_array([gv1, gv2, None]) table = pa.table({'id': [1, 2, 3], 'payload': col}) """ diff --git a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py index a11aae9aca08..ddfb368c8899 100644 --- a/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py +++ b/paimon-python/pypaimon/read/reader/format_pyarrow_reader.py @@ -23,6 +23,7 @@ from pyarrow import RecordBatch from pypaimon.common.file_io import FileIO +from pypaimon.common.options.core_options import CoreOptions from pypaimon.data.variant_shredding import ( VariantSchema, assemble_shredded_column, @@ -46,7 +47,8 @@ class FormatPyArrowReader(RecordBatchReader): def __init__(self, file_io: FileIO, file_format: str, file_path: str, read_fields: List[DataField], - push_down_predicate: Any, batch_size: int = 1024): + push_down_predicate: Any, batch_size: int = 1024, + options: CoreOptions = None): file_path_for_pyarrow = file_io.to_filesystem_path(file_path) self.dataset = ds.dataset(file_path_for_pyarrow, format=file_format, filesystem=file_io.filesystem) self._file_format = file_format @@ -61,10 +63,11 @@ def __init__(self, file_io: FileIO, file_format: str, file_path: str, # column name → VariantSchema for 
shredded columns that need assembly self._shredded_schemas: Dict[str, VariantSchema] = {} - for name in self.existing_fields: - field_type = file_schema.field(name).type - if is_shredded_variant(field_type): - self._shredded_schemas[name] = build_variant_schema(field_type) + if options is None or options.variant_shredding_enabled(): + for name in self.existing_fields: + field_type = file_schema.field(name).type + if is_shredded_variant(field_type): + self._shredded_schemas[name] = build_variant_schema(field_type) # Only pass existing fields to PyArrow scanner to avoid errors self.reader = self.dataset.scanner( diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py index 920ff423172d..1e8c00fc7ebb 100644 --- a/paimon-python/pypaimon/read/split_read.py +++ b/paimon-python/pypaimon/read/split_read.py @@ -178,7 +178,8 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool, ordered_read_fields = [name_to_field[n] for n in read_file_fields if n in name_to_field] format_reader = FormatPyArrowReader( self.table.file_io, file_format, file_path, - ordered_read_fields, read_arrow_predicate, batch_size=batch_size) + ordered_read_fields, read_arrow_predicate, batch_size=batch_size, + options=self.table.options) elif file_format in ('json', 'csv'): raise NotImplementedError( f"Reading '{file_format}' format is not yet supported in Python SDK. 
" diff --git a/paimon-python/pypaimon/tests/variant_test.py b/paimon-python/pypaimon/tests/variant_test.py index 86c89d9b5c67..4307c8ecffe4 100644 --- a/paimon-python/pypaimon/tests/variant_test.py +++ b/paimon-python/pypaimon/tests/variant_test.py @@ -302,45 +302,37 @@ def test_mixed_schema_round_trip(self): class TestGenericVariantContainer(unittest.TestCase): - def test_from_json_returns_instance(self): - gv = GenericVariant.from_json('{"age":30}') - self.assertIsInstance(gv, GenericVariant) - self.assertIsInstance(gv.value(), bytes) - self.assertIsInstance(gv.metadata(), bytes) - self.assertGreater(len(gv.value()), 0) - self.assertGreater(len(gv.metadata()), 0) - def test_from_python_returns_instance(self): gv = GenericVariant.from_python({'a': 1, 'b': 'hello'}) self.assertIsInstance(gv, GenericVariant) self.assertIsInstance(gv.value(), bytes) def test_from_arrow_struct_roundtrip(self): - original = GenericVariant.from_json('{"x":1,"y":2}') + original = GenericVariant.from_python({'x': 1, 'y': 2}) restored = GenericVariant.from_arrow_struct( {'value': original.value(), 'metadata': original.metadata()}) self.assertEqual(restored.value(), original.value()) self.assertEqual(restored.metadata(), original.metadata()) def test_to_python_object(self): - gv = GenericVariant.from_json('{"age":30,"city":"Beijing"}') + gv = GenericVariant.from_python({'age': 30, 'city': 'Beijing'}) result = gv.to_python() self.assertEqual(result, {'age': 30, 'city': 'Beijing'}) def test_to_python_array(self): - gv = GenericVariant.from_json('[1,2,3]') + gv = GenericVariant.from_python([1, 2, 3]) self.assertEqual(gv.to_python(), [1, 2, 3]) def test_to_python_null(self): - gv = GenericVariant.from_json('null') + gv = GenericVariant.from_python(None) self.assertIsNone(gv.to_python()) def test_to_python_string(self): - gv = GenericVariant.from_json('"hello"') + gv = GenericVariant.from_python('hello') self.assertEqual(gv.to_python(), 'hello') def test_to_python_number(self): - gv = 
GenericVariant.from_json('42') + gv = GenericVariant.from_python(42) self.assertEqual(gv.to_python(), 42) def test_from_python_none(self): @@ -356,12 +348,12 @@ def test_from_python_nested(self): self.assertTrue(result['active']) def test_equality(self): - gv1 = GenericVariant.from_json('{"a":1}') - gv2 = GenericVariant.from_json('{"a":1}') + gv1 = GenericVariant.from_python({'a': 1}) + gv2 = GenericVariant.from_python({'a': 1}) self.assertEqual(gv1, gv2) def test_repr_and_str(self): - gv = GenericVariant.from_json('"hello"') + gv = GenericVariant.from_python('hello') self.assertIn('hello', repr(gv)) self.assertIn('hello', str(gv)) @@ -369,8 +361,8 @@ def test_repr_and_str(self): class TestToArrowArray(unittest.TestCase): def test_basic(self): - gv1 = GenericVariant.from_json('{"a":1}') - gv2 = GenericVariant.from_json('[1,2]') + gv1 = GenericVariant.from_python({'a': 1}) + gv2 = GenericVariant.from_python([1, 2]) arr = GenericVariant.to_arrow_array([gv1, gv2]) self.assertIsInstance(arr, pa.StructArray) self.assertEqual(len(arr), 2) @@ -378,7 +370,7 @@ def test_basic(self): self.assertEqual(restored.to_python(), {'a': 1}) def test_with_nulls(self): - arr = GenericVariant.to_arrow_array([GenericVariant.from_json('42'), None]) + arr = GenericVariant.to_arrow_array([GenericVariant.from_python(42), None]) self.assertEqual(len(arr), 2) self.assertIsNotNone(arr[0].as_py()) self.assertIsNone(arr[1].as_py()) @@ -387,7 +379,7 @@ def test_empty(self): self.assertEqual(len(GenericVariant.to_arrow_array([])), 0) def test_arrow_type(self): - gv = GenericVariant.from_json('true') + gv = GenericVariant.from_python(True) arr = GenericVariant.to_arrow_array([gv]) self.assertEqual(arr.type, _variant_arrow_type()) @@ -395,7 +387,7 @@ def test_arrow_type(self): class TestJsonRoundtrip(unittest.TestCase): def _check(self, json_str): - gv = GenericVariant.from_json(json_str) + gv = GenericVariant.from_python(json.loads(json_str)) self.assertEqual(gv.to_python(), json.loads(json_str)) 
def test_nested_object_array(self): @@ -411,11 +403,11 @@ def test_all_primitive_types(self): self._check('{"n":null,"b":true,"i":42,"s":"hello","f":1.5}') def test_empty_object(self): - gv = GenericVariant.from_json('{}') + gv = GenericVariant.from_python({}) self.assertEqual(gv.to_python(), {}) def test_empty_array(self): - gv = GenericVariant.from_json('[]') + gv = GenericVariant.from_python([]) self.assertEqual(gv.to_python(), []) @@ -599,7 +591,7 @@ class TestEncodeScalar(unittest.TestCase): def _roundtrip(self, json_str: str, arrow_type: pa.DataType): """Encode a scalar to bytes via the Arrow type, then decode via GenericVariant.""" - gv_orig = GenericVariant.from_json(json_str) + gv_orig = GenericVariant.from_python(json.loads(json_str)) typed_value = pa.array([gv_orig.to_python()], type=arrow_type).to_pylist()[0] value_bytes = _encode_scalar_to_value_bytes(typed_value, arrow_type) gv = GenericVariant(value_bytes, b'\x01\x00') @@ -741,7 +733,7 @@ def test_object_absent_field_skipped(self): def test_object_with_overflow(self): """Fields not in typed_value are preserved from overflow bytes.""" - original = GenericVariant.from_json('{"age": 30, "extra": "overflow_val"}') + original = GenericVariant.from_python({'age': 30, 'extra': 'overflow_val'}) overflow_bytes = original.value() meta = original.metadata() key_dict = parse_metadata_dict(meta) @@ -763,7 +755,7 @@ def test_object_with_overflow(self): def test_typed_value_null_uses_overflow(self): """If typed_value struct is None for the whole row, full overflow bytes are used.""" - original = GenericVariant.from_json('{"age": 99}') + original = GenericVariant.from_python({'age': 99}) meta = original.metadata() key_dict = parse_metadata_dict(meta) @@ -779,9 +771,9 @@ def test_typed_value_null_uses_overflow(self): self.assertEqual(gv.to_python()['age'], 99) def test_rebuild_matches_direct_variant(self): - """Bytes rebuilt from shredded form must equal bytes from GenericVariant.from_json.""" + """Bytes rebuilt from 
shredded form must equal bytes from GenericVariant.from_python.""" original_json = '{"score": 42, "tag": "test"}' - original_gv = GenericVariant.from_json(original_json) + original_gv = GenericVariant.from_python(json.loads(original_json)) meta = original_gv.metadata() key_dict = parse_metadata_dict(meta) @@ -1063,7 +1055,7 @@ def test_plain_variant_write_and_read(self): gvs = [ GenericVariant.from_python({'age': 30, 'city': 'Beijing'}), GenericVariant.from_python({'score': 99, 'active': True}), - GenericVariant.from_json('[1, 2, 3]'), + GenericVariant.from_python([1, 2, 3]), ] data = pa.table( {'id': [1, 2, 3], 'payload': GenericVariant.to_arrow_array(gvs)}, diff --git a/paimon-python/pypaimon/write/writer/data_writer.py b/paimon-python/pypaimon/write/writer/data_writer.py index 84b9a12dc79b..f4802ae74e5e 100644 --- a/paimon-python/pypaimon/write/writer/data_writer.py +++ b/paimon-python/pypaimon/write/writer/data_writer.py @@ -72,7 +72,8 @@ def __init__(self, table, partition: Tuple, bucket: int, max_seq_number: int, op self._current_external_path: Optional[str] = None # Variant shredding (static mode) — col_name → (obj_fields, target_arrow_type) self._variant_shredding: Dict[str, Tuple] = {} - if self.file_format == CoreOptions.FILE_FORMAT_PARQUET: + if self.file_format == CoreOptions.FILE_FORMAT_PARQUET \ + and self.options.variant_shredding_enabled(): shredding_json = self.options.variant_shredding_schema() if shredding_json: from pypaimon.data.variant_shredding import ( From 3bf324baf93349a7c1a491b8974b087a91979b9f Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:07:07 +0800 Subject: [PATCH 4/7] support VARIANT for pypaimon --- paimon-python/pypaimon/data/variant_shredding.py | 2 -- paimon-python/pypaimon/schema/data_types.py | 5 ++++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paimon-python/pypaimon/data/variant_shredding.py 
b/paimon-python/pypaimon/data/variant_shredding.py index 390e1b81c399..cad1351157eb 100644 --- a/paimon-python/pypaimon/data/variant_shredding.py +++ b/paimon-python/pypaimon/data/variant_shredding.py @@ -47,8 +47,6 @@ the exact variant key names. """ -from __future__ import annotations - import datetime import decimal as _decimal from dataclasses import dataclass diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index 27c1f7d145c7..8608fa4ed6d3 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -455,7 +455,10 @@ def parse_data_field( def is_variant_struct(pa_type: pyarrow.StructType) -> bool: -    """Return True if *pa_type* is the ``struct`` encoding of VARIANT.""" +    """Return True if *pa_type* is the plain (unshredded) VARIANT struct encoding. + +    Matches ``struct<value: binary not null, metadata: binary not null>``. +    """ if pa_type.num_fields != 2: return False names = {pa_type.field(i).name for i in range(pa_type.num_fields)} From f8b700686dc54646737b7a6f4b96eed9de30a1c6 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:47:40 +0800 Subject: [PATCH 5/7] support VARIANT for pypaimon --- paimon-python/pypaimon/data/variant_shredding.py | 4 ++-- paimon-python/pypaimon/schema/data_types.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paimon-python/pypaimon/data/variant_shredding.py b/paimon-python/pypaimon/data/variant_shredding.py index cad1351157eb..a6ae77805131 100644 --- a/paimon-python/pypaimon/data/variant_shredding.py +++ b/paimon-python/pypaimon/data/variant_shredding.py @@ -192,7 +192,7 @@ def is_shredded_variant(pa_type: pa.DataType) -> bool: """ if not pa.types.is_struct(pa_type): return False -    names = {pa_type.field(i).name for i in range(pa_type.num_fields)} +    names = {pa_type[i].name for i in range(pa_type.num_fields)} return 'metadata' in names and 'value' in names and 'typed_value' in names @@ -205,7 
+205,7 @@ def build_variant_schema(pa_type: pa.StructType) -> VariantSchema: """ schema = VariantSchema(num_fields=pa_type.num_fields) for i in range(pa_type.num_fields): - f = pa_type.field(i) + f = pa_type[i] if f.name == 'metadata': schema.metadata_idx = i elif f.name == 'value': diff --git a/paimon-python/pypaimon/schema/data_types.py b/paimon-python/pypaimon/schema/data_types.py index 8608fa4ed6d3..70639ba1a82d 100755 --- a/paimon-python/pypaimon/schema/data_types.py +++ b/paimon-python/pypaimon/schema/data_types.py @@ -461,11 +461,11 @@ def is_variant_struct(pa_type: pyarrow.StructType) -> bool: """ if pa_type.num_fields != 2: return False - names = {pa_type.field(i).name for i in range(pa_type.num_fields)} + names = {pa_type[i].name for i in range(pa_type.num_fields)} if names != {'value', 'metadata'}: return False return all( - pyarrow.types.is_binary(pa_type.field(n).type) and not pa_type.field(n).nullable + pyarrow.types.is_binary(pa_type[n].type) and not pa_type[n].nullable for n in ('value', 'metadata') ) From 3d6ca5b412e9d06c7911938999272c30a0f83c91 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 15:50:17 +0800 Subject: [PATCH 6/7] support VARIANT for pypaimon --- .../pypaimon/tests/e2e/java_py_read_write_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py index 01baf771516b..941d312fe849 100644 --- a/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py +++ b/paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py @@ -797,9 +797,9 @@ def test_py_write_variant_table(self): table = self.catalog.get_table(table_name) variant_col = GenericVariant.to_arrow_array([ - GenericVariant.from_json('{"name":"test","value":42}'), - GenericVariant.from_json('[10,20,30]'), - GenericVariant.from_json('"hello"'), + 
GenericVariant.from_python({"name": "test", "value": 42}), + GenericVariant.from_python([10, 20, 30]), + GenericVariant.from_python("hello"), None, # SQL NULL at the column level, not a VARIANT containing JSON null ]) data = pa.table({ @@ -842,9 +842,9 @@ def test_py_write_variant_table(self): # Use data with age+city fields so the shredded sub-columns are exercised. # Row 3 is an array — it has no age/city, so it goes entirely to overflow. shredded_variant_col = GenericVariant.to_arrow_array([ - GenericVariant.from_json('{"age":30,"city":"Beijing"}'), - GenericVariant.from_json('{"age":25,"city":"Shanghai"}'), - GenericVariant.from_json('[1,2,3]'), + GenericVariant.from_python({"age": 30, "city": "Beijing"}), + GenericVariant.from_python({"age": 25, "city": "Shanghai"}), + GenericVariant.from_python([1, 2, 3]), ]) shredded_data = pa.table( { From 7184fcb232a3eb3e5c103e7a1d93361d1ec86a93 Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:31:43 +0800 Subject: [PATCH 7/7] support VARIANT for pypaimon --- paimon-python/dev/run_mixed_tests.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/paimon-python/dev/run_mixed_tests.sh b/paimon-python/dev/run_mixed_tests.sh index 2067d1813eab..03e81f6e0d07 100755 --- a/paimon-python/dev/run_mixed_tests.sh +++ b/paimon-python/dev/run_mixed_tests.sh @@ -499,16 +499,21 @@ main() { echo "" - # Run VARIANT type test (Java write, Python read) - if ! run_java_variant_write_py_read_test; then - java_variant_write_py_read_result=1 - fi + # Run VARIANT type tests (requires Python >= 3.7) + if [[ "$PYTHON_MINOR" -ge 7 ]]; then + if ! run_java_variant_write_py_read_test; then + java_variant_write_py_read_result=1 + fi - echo "" + echo "" - # Run VARIANT Python-write Java-read test - if ! run_py_variant_write_java_read_test; then - py_variant_write_java_read_result=1 + if ! 
run_py_variant_write_java_read_test; then + py_variant_write_java_read_result=1 + fi + else + echo -e "${YELLOW}⏭ Skipping VARIANT Type Tests (requires Python >= 3.7, current: $PYTHON_VERSION)${NC}" + java_variant_write_py_read_result=0 + py_variant_write_java_read_result=0 fi echo ""