Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions .github/workflows/paimon-python-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ jobs:
else
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6'
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6' pypaimon-rust==0.2.0
python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/
if python -c "import sys; sys.exit(0 if sys.version_info >= (3, 11) else 1)"; then
python -m pip install vortex-data==0.70.0
Expand All @@ -163,15 +163,6 @@ jobs:
maturin build --release
pip install target/wheels/tantivy-*.whl

- name: Build and install pypaimon-rust from source
if: matrix.python-version != '3.6.15'
shell: bash
run: |
git clone https://github.com/apache/paimon-rust.git /tmp/paimon-rust
cd /tmp/paimon-rust/bindings/python
maturin build --release -o dist
pip install dist/pypaimon_rust-*.whl
pip install 'datafusion>=52'

- name: Run lint-python.sh
shell: bash
Expand Down
21 changes: 10 additions & 11 deletions paimon-python/pypaimon/tests/blob_table_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def test_data_blob_writer_write_operations(self):
blob_writer.close()

def test_data_blob_writer_write_large_blob(self):
"""Test DataBlobWriter with very large blob data (50MB per item) in 10 batches."""
"""Test DataBlobWriter with large blob data (5MB per item) in 10 batches."""
from pypaimon import Schema

# Create schema with blob column
Expand All @@ -436,28 +436,27 @@ def test_data_blob_writer_write_large_blob(self):
write_builder = table.new_batch_write_builder()
blob_writer = write_builder.new_write()

# Create 50MB blob data per item
# Using a pattern to make the data more realistic and compressible
target_size = 50 * 1024 * 1024 # 50MB in bytes
# Create 5MB blob data per item
target_size = 5 * 1024 * 1024 # 5MB in bytes
blob_pattern = b'LARGE_BLOB_DATA_PATTERN_' + b'X' * 1024 # ~1KB pattern
pattern_size = len(blob_pattern)
repetitions = target_size // pattern_size
large_blob_data = blob_pattern * repetitions

# Verify the blob size is approximately 50MB
# Verify the blob size is approximately 5MB
blob_size_mb = len(large_blob_data) / (1024 * 1024)
self.assertGreater(blob_size_mb, 49) # Should be at least 49MB
self.assertLess(blob_size_mb, 51) # Should be less than 51MB
self.assertGreater(blob_size_mb, 4) # Should be at least 4MB
self.assertLess(blob_size_mb, 6) # Should be less than 6MB

total_rows = 0

# Write 10 batches, each with 5 rows (50 rows total)
# Total data volume: 50 rows * 50MB = 2.5GB of blob data
# Total data volume: 50 rows * 5MB = 250MB of blob data
for batch_num in range(10):
batch_data = pa.Table.from_pydict({
'id': [batch_num * 5 + i for i in range(5)],
'description': [f'Large blob batch {batch_num}, row {i}' for i in range(5)],
'large_blob': [large_blob_data] * 5 # 5 rows per batch, each with 50MB blob
'large_blob': [large_blob_data] * 5 # 5 rows per batch, each with 5MB blob
}, schema=pa_schema)

# Write each batch
Expand Down Expand Up @@ -502,9 +501,9 @@ def test_data_blob_writer_write_large_blob(self):
# Verify the total row count written is 50 (10 batches * 5 rows per batch)
self.assertEqual(total_row_count, 50)

# Verify total file size is substantial (should be much larger than 2.5GB due to overhead)
# Verify total file size is substantial (should be at least 200MB)
total_size_mb = total_file_size / (1024 * 1024)
self.assertGreater(total_size_mb, 2000) # Should be at least 2GB due to overhead
self.assertGreater(total_size_mb, 200)

total_files = sum(len(commit_msg.new_files) for commit_msg in commit_messages)
print(f"Total data written: {total_size_mb:.2f}MB across {total_files} files")
Expand Down
Loading