diff --git a/.github/workflows/paimon-python-checks.yml b/.github/workflows/paimon-python-checks.yml index de00337be359..5aaa2ab72ae6 100755 --- a/.github/workflows/paimon-python-checks.yml +++ b/.github/workflows/paimon-python-checks.yml @@ -145,7 +145,7 @@ jobs: else python -m pip install --upgrade pip pip install torch --index-url https://download.pytorch.org/whl/cpu - python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6' + python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6' pypaimon-rust==0.2.0 python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/ if python -c "import sys; sys.exit(0 if sys.version_info >= (3, 11) else 1)"; then python -m pip install vortex-data==0.70.0 @@ -163,15 +163,6 @@ jobs: maturin build --release pip install target/wheels/tantivy-*.whl - - name: Build and install pypaimon-rust from source - if: matrix.python-version != '3.6.15' - shell: bash - run: | - git clone https://github.com/apache/paimon-rust.git /tmp/paimon-rust - cd /tmp/paimon-rust/bindings/python - maturin build --release -o dist - pip install dist/pypaimon_rust-*.whl - pip install 'datafusion>=52' - name: Run lint-python.sh shell: bash diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py index c4e5a4d1bd3f..7261503450f2 100755 --- a/paimon-python/pypaimon/tests/blob_table_test.py +++ b/paimon-python/pypaimon/tests/blob_table_test.py @@ -412,7 +412,7 @@ def test_data_blob_writer_write_operations(self): blob_writer.close() def test_data_blob_writer_write_large_blob(self): - """Test DataBlobWriter with very large blob data (50MB per item) in 10 batches.""" + """Test DataBlobWriter with large blob data (5MB per item) in 10 batches.""" from pypaimon import Schema # Create schema with blob column @@ -436,28 +436,27 @@ def test_data_blob_writer_write_large_blob(self): write_builder = table.new_batch_write_builder() blob_writer = write_builder.new_write() - # Create 50MB blob data per item - # Using a pattern to make the data more realistic and compressible - target_size = 50 * 1024 * 1024 # 50MB in bytes + # Create 5MB blob data per item + target_size = 5 * 1024 * 1024 # 5MB in bytes blob_pattern = b'LARGE_BLOB_DATA_PATTERN_' + b'X' * 1024 # ~1KB pattern pattern_size = len(blob_pattern) repetitions = target_size // pattern_size large_blob_data = blob_pattern * repetitions - # Verify the blob size is approximately 50MB + # Verify the blob size is approximately 5MB blob_size_mb = len(large_blob_data) / (1024 * 1024) - self.assertGreater(blob_size_mb, 49) # Should be at least 49MB - self.assertLess(blob_size_mb, 51) # Should be less than 51MB + self.assertGreater(blob_size_mb, 4) # Should be at least 4MB + self.assertLess(blob_size_mb, 6) # Should be less than 6MB total_rows = 0 # Write 10 batches, each with 5 rows (50 rows total) - # Total data volume: 50 rows * 50MB = 2.5GB of blob data + # Total data volume: 50 rows * 5MB = 250MB of blob data for batch_num in range(10): batch_data = pa.Table.from_pydict({ 'id': [batch_num * 5 + i for i in range(5)], 'description': [f'Large blob batch {batch_num}, row {i}' for i in range(5)], - 'large_blob': [large_blob_data] * 5 # 5 rows per batch, each with 50MB blob + 'large_blob': [large_blob_data] * 5 # 5 rows per batch, each with 5MB blob }, schema=pa_schema) # Write each batch @@ -502,9 +501,9 @@ def test_data_blob_writer_write_large_blob(self): # Verify total data written (50 rows of normal data + 50 rows of blob data = 100 total) self.assertEqual(total_row_count, 50) - # Verify total file size is substantial (should be much larger than 2.5GB due to overhead) + # Verify total file size is substantial (should be at least 200MB) total_size_mb = total_file_size / (1024 * 1024) - self.assertGreater(total_size_mb, 2000) # Should be at least 2GB due to overhead + self.assertGreater(total_size_mb, 200) total_files = sum(len(commit_msg.new_files) for commit_msg in commit_messages) print(f"Total data written: {total_size_mb:.2f}MB across {total_files} files")