# Post-patch view of the two complete definitions carried by this change set:
# ``_jsons_equiv`` (data_diff/utils.py) and its unit tests (tests/test_utils.py).
# The same change set also maps the MySQL column type "json" to ``JSON`` in
# ``Dialect`` and leaves a commented-out zero-date normalization sketch in
# ``normalize_timestamp`` (data_diff/databases/mysql.py); those hunks are
# fragments of definitions that are not fully visible here.
import json
import unittest
from typing import Optional


def _jsons_equiv(a: Optional[str], b: Optional[str]) -> bool:
    """Return True when *a* and *b* are equivalent JSON values.

    Args:
        a: JSON text from one side of the comparison, or None for a DB NULL.
        b: JSON text from the other side, or None for a DB NULL.

    Returns:
        True if the two inputs are the same string, or if both parse as JSON
        and the parsed values compare equal (so object key order and
        insignificant whitespace do not matter). False if they parse to
        different values or if either side cannot be parsed.

    Note:
        Because of the identical-string fast path, two *identical* strings are
        reported as equivalent even when they are not valid JSON.
    """
    # Treat Python None (DB null) as the JSON null literal so that a NULL on
    # the MySQL side matches a 'null' string produced by TO_JSON_STRING(NULL)
    # on the BigQuery side (or any other DB that serializes NULL as 'null').
    if a is None:
        a = "null"
    if b is None:
        b = "null"

    # Fast-path: identical strings don't need JSON parsing.
    if a == b:
        return True

    try:
        return json.loads(a) == json.loads(b)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (its subclass); TypeError
        # covers inputs json.loads cannot accept. RecursionError is raised for
        # pathologically nested documents; equivalence cannot be established
        # for those, so they are conservatively reported as different instead
        # of letting the error abort the caller.
        return False


class TestJsonsEquiv(unittest.TestCase):
    """Unit tests for ``_jsons_equiv``."""

    # --- None / null equivalence ---
    def test_both_none(self):
        """Two DB NULLs are equivalent."""
        self.assertTrue(_jsons_equiv(None, None))

    def test_none_vs_json_null_string(self):
        """DB NULL on one side, JSON 'null' string on the other, are equivalent."""
        self.assertTrue(_jsons_equiv(None, "null"))
        self.assertTrue(_jsons_equiv("null", None))

    def test_none_vs_json_string_null(self):
        """DB NULL must NOT equal the JSON string literal \"null\"."""
        self.assertFalse(_jsons_equiv(None, '"null"'))
        self.assertFalse(_jsons_equiv('"null"', None))

    # --- Identical strings fast-path ---
    def test_identical_strings(self):
        self.assertTrue(_jsons_equiv('{"a": 1}', '{"a": 1}'))

    def test_identical_invalid_strings(self):
        # The fast path compares raw text before any parsing is attempted.
        self.assertTrue(_jsons_equiv("not-json", "not-json"))

    # --- Semantic JSON equivalence ---
    def test_equivalent_objects_different_whitespace(self):
        self.assertTrue(_jsons_equiv('{"a":1,"b":2}', '{"b": 2, "a": 1}'))

    def test_equivalent_arrays(self):
        self.assertTrue(_jsons_equiv("[1, 2, 3]", "[1,2,3]"))

    def test_different_values(self):
        self.assertFalse(_jsons_equiv('{"a": 1}', '{"a": 2}'))

    def test_different_types(self):
        self.assertFalse(_jsons_equiv("1", '"1"'))

    # --- Invalid JSON ---
    def test_invalid_json_returns_false(self):
        # Different invalid-JSON strings → False (can't parse either side)
        self.assertFalse(_jsons_equiv("not-json", "also-not-json"))
        self.assertFalse(_jsons_equiv('{"a": 1}', "not-json"))

    # --- Pathological nesting ---
    def test_deeply_nested_returns_false(self):
        # Nesting far beyond the interpreter's recursion limit cannot be
        # parsed; the comparison must answer False rather than raise.
        depth = 100_000
        opening = "[" * depth
        closing = "]" * depth
        self.assertFalse(_jsons_equiv(opening + closing, opening + " " + closing))