diff --git a/synapseclient/core/upload/upload_utils.py b/synapseclient/core/upload/upload_utils.py index 9175dfd0e..99c7fd429 100644 --- a/synapseclient/core/upload/upload_utils.py +++ b/synapseclient/core/upload/upload_utils.py @@ -70,6 +70,9 @@ def get_partial_dataframe_chunk( header=False, index=False, float_format="%.12g", + doublequote=False, + escapechar="\\", + quoting=0, **(to_csv_kwargs or {}), ) number_of_bytes_in_buffer = buffer.tell() diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index dc583975b..8e5083005 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -141,6 +141,7 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]: def convert_dtypes_to_json_serializable(df): """ Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types. + Replace both Ellipsis and pandas NA within nested structures which are not JSON serializable types. Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types. Arguments: df: The dataframe to convert the dtypes of. @@ -163,6 +164,29 @@ def convert_dtypes_to_json_serializable(df): "datetime_list_col": [[datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], [datetime(2021, 1, 4), datetime(2021, 1, 5), datetime(2021, 1, 6)], None, [datetime(2021, 1, 7), datetime(2021, 1, 8), datetime(2021, 1, 9)]], "entityid_list_col": [["syn123", "syn456", None], ["syn101", "syn102", "syn103"], None, ["syn104", "syn105", "syn106"]], "userid_list_col": [["user1", "user2", "user3"], ["user4", "user5", None], None, ["user7", "user8", "user9"]], + "json_col_with_quotes": [ + { + "id": 1, + "description": 'Text with "quotes" in the description field', + "references": [] + }, + { + "id": 2, + "description": 'Another description with "quoted text" here',` + "references": ["ref1", "ref2"] + }, + { + "id": 3, + "description": 'Description containing "multiple" quoted "words"', + "references": [...] + } + { + "id": 4, + "description": 'Description containing apostrophes sage\'s', + "references": [...] + } + + ], }).convert_dtypes() df = convert_dtypes_to_json_serializable(df) print(df) @@ -170,9 +194,38 @@ def convert_dtypes_to_json_serializable(df): import pandas as pd for col in df.columns: - df[col] = ( - df[col].replace({pd.NA: None}).astype(object) - ) # this will convert the int64 and float64 columns to object columns + sample_values = df[col].dropna() + if len(sample_values): + + def _serialize_json_value(x): + if isinstance(x, (list, dict)): + + def _reformat_special_values(obj): + if obj is ...: + return "..." + if obj is pd.NA: + return None + if isinstance(obj, dict): + return { + k: _reformat_special_values(v) for k, v in obj.items() + } + if isinstance(obj, list): + return [_reformat_special_values(item) for item in obj] + return obj + + cleaned_x = _reformat_special_values(x) + return cleaned_x + # Handle standalone ellipsis + if x is ...: + return "..." + return x + + df[col] = df[col].apply(lambda x: _serialize_json_value(x)) + + # restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype + df[col] = df[col].convert_dtypes() + df[col] = df[col].replace({pd.NA: None}).astype(object) + # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) if col in [ "ROW_ID", @@ -2809,7 +2862,6 @@ async def main(): timeout=timeout, synapse_client=synapse_client, ) - if download_location: return csv_path @@ -4031,8 +4083,9 @@ async def _chunk_and_upload_df( to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv` function when writing the data to a CSV file. """ + # Serializes dict/list values to JSON strings + df = convert_dtypes_to_json_serializable(df) # Loop over the rows of the DF to determine the size/boundries we'll be uploading - chunks_to_upload = [] size_of_chunk = 0 buffer = BytesIO() diff --git a/tests/integration/synapseclient/models/async/test_table_async.py b/tests/integration/synapseclient/models/async/test_table_async.py index b2d49e2ca..432d8e9bf 100644 --- a/tests/integration/synapseclient/models/async/test_table_async.py +++ b/tests/integration/synapseclient/models/async/test_table_async.py @@ -10,6 +10,7 @@ import pandas as pd import pytest +from pandas.api.types import is_object_dtype from pytest_mock import MockerFixture import synapseclient.models.mixins.asynchronous_job as asynchronous_job_module @@ -351,6 +352,8 @@ async def test_store_rows_from_csv_infer_columns( "float_string": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -512,6 +515,8 @@ async def test_store_rows_from_manually_defined_columns( "float_column": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -977,6 +982,179 @@ async def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 + async def test_store_rows_with_quotes_and_apostrophes_ellipses( + self, project_model: Project + ) -> None: + """Test columns with quotes, apostrophes, and ellipses (in lists, dicts, and standalone) in values are properly stored and retrieved in the tables""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + Column( + name="string_list_with_ellipses", column_type=ColumnType.STRING_LIST + ), + Column(name="string_col_with_ellipses", column_type=ColumnType.STRING), + Column(name="int_list_with_pa_na", column_type=ColumnType.INTEGER_LIST), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with quotes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4, 5, 6, 7], + "json_data": [ + {"description": 'Text with "quotes" here', "value": 100}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": ..., + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, ...], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": ...}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", ...], + ["d", ..., "f"], + ["g", "h", "i"], + [...], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], + ], + "string_col_with_ellipses": [ + "value1", + ..., + "value3", + ..., + "value6", + ..., + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + pd.NA, + [7, 8, 9], + pd.NA, + [11, 12, 13], + pd.NA, + [15, 16, 17], + ], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA, 31, pd.NA, 32]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA, 3.4, pd.NA, 3.5]), + } + ) + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + # THEN I can query the table and retrieve the data correctly + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + # AND the JSON data should be properly preserved with quotes + assert len(results) == 7 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4, 5, 6, 7], + "json_data": [ + {"description": 'Text with "quotes" here', "value": 100}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": "...", + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, "..."], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": "..."}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", "..."], + ["d", "...", "f"], + ["g", "h", "i"], + ["..."], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], + ], + "string_col_with_ellipses": [ + "value1", + "...", + "value3", + "...", + "value6", + "...", + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + [], + [7, 8, 9], + [], + [11, 12, 13], + [], + [15, 16, 17], + ], + "nullable_int": pd.array([10, None, 30, None, 31, None, 32]), + "nullable_float": pd.array([1.1, None, 3.3, None, 3.4, None, 3.5]), + } + ) + assert is_object_dtype(results.json_data) + assert is_object_dtype(results.int_list_with_pa_na) + assert is_object_dtype(results.nullable_int) + assert is_object_dtype(results.nullable_float) + + expected_result = expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) + class TestUpsertRows: @pytest.fixture(autouse=True, scope="function") @@ -1549,9 +1727,14 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) + # import pdb; pdb.set_trace() pd.testing.assert_frame_equal( results_after_insert, expected_results, check_dtype=False ) + # Create a second test file to update references path2 = utils.make_bogus_data_file() self.schedule_for_cleanup(path2) @@ -1733,7 +1916,10 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) pd.testing.assert_frame_equal(results, expected_results, check_dtype=False) + # WHEN I upsert with multiple primary keys and null values multi_key_data = pd.DataFrame( { diff --git a/tests/integration/synapseclient/models/synchronous/test_table.py b/tests/integration/synapseclient/models/synchronous/test_table.py index d0629b75d..6178de523 100644 --- a/tests/integration/synapseclient/models/synchronous/test_table.py +++ b/tests/integration/synapseclient/models/synchronous/test_table.py @@ -9,6 +9,7 @@ import pandas as pd import pytest +from pandas.api.types import is_object_dtype from pytest_mock import MockerFixture import synapseclient.models.mixins.asynchronous_job as asynchronous_job_module @@ -325,6 +326,8 @@ def test_store_rows_from_csv_infer_columns( "float_string": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -474,6 +477,8 @@ def test_store_rows_from_manually_defined_columns( "float_column": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -927,6 +932,178 @@ def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 + def test_store_rows_with_quotes_and_apostrophes_ellipses( + self, project_model: Project + ) -> None: + """Test columns with quotes, apostrophes, and ellipses (in lists, dicts, and standalone) in values are properly stored and retrieved in the tables""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + Column( + name="string_list_with_ellipses", column_type=ColumnType.STRING_LIST + ), + Column(name="string_col_with_ellipses", column_type=ColumnType.STRING), + Column(name="int_list_with_pa_na", column_type=ColumnType.INTEGER_LIST), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with quotes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4, 5, 6, 7], + "json_data": [ + {"description": 'Text with "quotes" here', "value": 100}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": ..., + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, ...], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": ...}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", ...], + ["d", ..., "f"], + ["g", "h", "i"], + [...], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], + ], + "string_col_with_ellipses": [ + "value1", + ..., + "value3", + ..., + "value6", + ..., + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + pd.NA, + [7, 8, 9], + pd.NA, + [11, 12, 13], + pd.NA, + [15, 16, 17], + ], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA, 31, pd.NA, 32]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA, 3.4, pd.NA, 3.5]), + } + ) + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + # THEN I can query the table and retrieve the data correctly + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + # AND the JSON data should be properly preserved with quotes + assert len(results) == 7 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4, 5, 6, 7], + "json_data": [ + {"description": 'Text with "quotes" here', "value": 100}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": "...", + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, "..."], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": "..."}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", "..."], + ["d", "...", "f"], + ["g", "h", "i"], + ["..."], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], + ], + "string_col_with_ellipses": [ + "value1", + "...", + "value3", + "...", + "value6", + "...", + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + [], + [7, 8, 9], + [], + [11, 12, 13], + [], + [15, 16, 17], + ], + "nullable_int": pd.array([10, None, 30, None, 31, None, 32]), + "nullable_float": pd.array([1.1, None, 3.3, None, 3.4, None, 3.5]), + } + ) + assert is_object_dtype(results.json_data) + assert is_object_dtype(results.int_list_with_pa_na) + assert is_object_dtype(results.nullable_int) + assert is_object_dtype(results.nullable_float) + + expected_result = expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) + class TestUpsertRows: @pytest.fixture(autouse=True, scope="function") @@ -1488,6 +1665,9 @@ def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) + # import pdb; pdb.set_trace() pd.testing.assert_frame_equal( results_after_insert, expected_results, check_dtype=False ) @@ -1675,6 +1855,8 @@ def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) pd.testing.assert_frame_equal(results, expected_results, check_dtype=False) # WHEN I upsert with multiple primary keys and null values diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 5f8b91566..0db8ecddd 100644 --- a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -1,3 +1,4 @@ +import json import os import re from collections import OrderedDict @@ -9,6 +10,7 @@ import numpy as np import pandas as pd import pytest +from pandas.api.types import is_float_dtype, is_integer_dtype, is_object_dtype from synapseclient import Synapse from synapseclient.api import ViewEntityType, ViewTypeMask @@ -37,6 +39,7 @@ _query_table_csv, _query_table_next_page, _query_table_row_set, + convert_dtypes_to_json_serializable, csv_to_pandas_df, ) from synapseclient.models.table_components import ( @@ -3968,3 +3971,208 @@ def test_csv_pandas_df_with_row_id_and_version_etag_in_index( ).convert_dtypes() # resolve datatype issue such as StringDtype vs object # THEN assert the dataframe is equal to the expected dataframe pd.testing.assert_frame_equal(df, expected_df) + + +class TestConvertDtypesToJsonSerializable: + """Tests for convert_dtypes_to_json_serializable function""" + + def test_no_conversion_when_no_na_in_column(self): + """Test that int64 and float64 columns are not converted to object dtype when no NA is present""" + df = pd.DataFrame({"int_col": [1, 2, 3, 4], "float_col": [1.1, 2.2, 3.3, 4.4]}) + assert df["int_col"].dtype == "int64" + assert df["float_col"].dtype == "float64" + + result = convert_dtypes_to_json_serializable(df) + assert is_object_dtype(result.int_col) + assert is_object_dtype(result.float_col) + assert list(result["int_col"]) == [1, 2, 3, 4] + assert list(result["float_col"]) == [1.1, 2.2, 3.3, 4.4] + + def test_convert_na_and_columns_to_object(self): + """Test that pd.NA values are converted to None for int64 and float64 columns by _serialize_json_value""" + df = pd.DataFrame( + { + "int_col": pd.array([1, 2, pd.NA, 4], dtype="Int64"), + "float_col": pd.array([1.1, 2.2, pd.NA, 4.4], dtype="Float64"), + } + ) + result = convert_dtypes_to_json_serializable(df) + assert is_object_dtype(result.int_col) + assert is_object_dtype(result.float_col) + assert list(result["int_col"]) == [1, 2, None, 4] + assert list(result["float_col"]) == [1.1, 2.2, None, 4.4] + + def test_row_columns_remain_int(self): + """Test that ROW_ID, ROW_VERSION, and ROW_ID.1 columns remain as int while other columns become object""" + # GIVEN a dataframe with special columns (ROW_ID, ROW_VERSION, ROW_ID.1) and a regular column + df = pd.DataFrame( + { + "ROW_ID": pd.array([1, 2, 3, 4], dtype="Int64"), + "ROW_VERSION": pd.array([5, 6, 7, 8], dtype="Int64"), + "ROW_ID.1": pd.array([9, 10, 11, 12], dtype="Int64"), + "other_col": [10, 20, 30, 40], # Use regular list without pd.NA + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN all special columns should remain as int while other_col should become object + assert is_integer_dtype(result.ROW_ID), "ROW_ID should remain integer dtype" + assert is_integer_dtype( + result.ROW_VERSION + ), "ROW_VERSION should remain int64 dtype" + assert is_integer_dtype( + result["ROW_ID.1"] + ), "ROW_ID.1 should remain int64 dtype" + assert is_object_dtype(result.other_col), "other_col should become object dtype" + + def test_ellipsis_handling_in_list(self): + """Test that Ellipsis (...) objects in lists are converted to '...' strings""" + # GIVEN a dataframe with Ellipsis in a list + df = pd.DataFrame({"list_with_ellipsis": [[1, 2, ...], [4, ..., 6]]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." in JSON string + assert result["list_with_ellipsis"].iloc[0] == [1, 2, "..."] + assert result["list_with_ellipsis"].iloc[1] == [4, "...", 6] + assert is_object_dtype(result.list_with_ellipsis) + + def test_ellipsis_handling_in_dict(self): + """Test that Ellipsis (...) objects in dicts are converted to '...' strings""" + # GIVEN a dataframe with Ellipsis in a dict + df = pd.DataFrame( + { + "dict_with_ellipsis": [ + {"id": 1, "data": ...}, + {"id": 2, "items": [1, ...]}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." in JSON string + assert result.dict_with_ellipsis.iloc[0] == {"id": 1, "data": "..."} + assert result.dict_with_ellipsis.iloc[1] == {"id": 2, "items": [1, "..."]} + assert is_object_dtype(result.dict_with_ellipsis) + + def test_standalone_ellipsis(self): + """Test that standalone Ellipsis objects are converted to '...' strings""" + # GIVEN a dataframe with standalone Ellipsis + df = pd.DataFrame({"ellipsis_col": [1, ..., 3]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." + assert result["ellipsis_col"].iloc[0] == 1 + assert result["ellipsis_col"].iloc[1] == "..." + assert result["ellipsis_col"].iloc[2] == 3 + + def test_none_in_list_serialized_to_empty_list(self): + """Test that None values in list columns are serialized to '[]'""" + # GIVEN a dataframe with None in list column + df = pd.DataFrame({"list_col": [[1, 2, 3], pd.NA, [7, 8, 9]]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN None should be converted to "[]" + assert result["list_col"].iloc[0] == [1, 2, 3] + assert result["list_col"].iloc[1] == None + assert result["list_col"].iloc[2] == [7, 8, 9] + + def test_dict_with_quotes_in_values(self): + """Test that dicts with quotes in string values are properly handled""" + # GIVEN a dataframe with dict containing quotes + df = pd.DataFrame( + { + "dict_col": [ + {"description": 'Text with "quotes" here'}, + {"description": 'Another "quoted" text'}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN the JSON string should be properly formatted + assert result["dict_col"].iloc[0] == {"description": 'Text with "quotes" here'} + assert result["dict_col"].iloc[1] == {"description": 'Another "quoted" text'} + assert is_object_dtype(result.dict_col) + + def test_empty_dataframe(self): + """Test that empty dataframe is handled correctly""" + # GIVEN an empty dataframe + df = pd.DataFrame() + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN it should return an empty dataframe + assert len(result) == 0 + assert len(result.columns) == 0 + + def test_mixed_column_types_no_conversion_needed(self): + """Test that multiple column types are handled correctly together""" + # GIVEN a dataframe with mixed column types + df = pd.DataFrame( + { + "ROW_ID": pd.array([1, 2, 3], dtype="Int64"), + "ROW_VERSION": pd.array([1, 1, 1], dtype="Int64"), + "int_col": [10, 20, 30], # Use regular list without pd.NA + "float_col": [1.1, 2.2, 3.3], + "string_col": ["a", "b", "c"], + "list_col": [[1, 2], [3, 4], None], + "dict_col": [{"id": 1}, {"id": 2}, {"id": 3}], + "bool_col": [True, False, True], + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN verify all conversions + pd.testing.assert_frame_equal(result, df) + + def test_nested_dict_with_ellipsis(self): + """Test that nested dicts with Ellipsis are properly handled""" + # GIVEN a dataframe with nested dict containing Ellipsis + df = pd.DataFrame( + { + "nested_dict": [ + {"outer": {"inner": ...}}, + {"data": {"list": [1, 2, ...]}}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted in nested structures + assert result["nested_dict"].iloc[0] == {"outer": {"inner": "..."}} + assert result["nested_dict"].iloc[1] == {"data": {"list": [1, 2, "..."]}} + + def test_nullable_int64_with_pd_na(self): + """Test that Int64 columns with pd.NA get pd.NA converted to None by _serialize_json_value""" + # GIVEN a dataframe with nullable Int64 column containing pd.NA + df = pd.DataFrame( + {"nullable_int_col": pd.array([1, 2, pd.NA, 4, pd.NA], dtype="Int64")} + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN the column should be object type and pd.NA should be converted to None + assert is_object_dtype(result.nullable_int_col) + expected_result = pd.DataFrame( + {"nullable_int_col": [1, 2, None, 4, None]} + ).convert_dtypes() + pd.testing.assert_frame_equal(result, expected_result, check_dtype=False) + assert is_object_dtype(result.nullable_int_col)