Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 22 additions & 29 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -845,44 +845,37 @@ std::string GetLastErrorMessage();

// TODO: Move this to Python
std::string GetModuleDirectory() {
namespace fs = std::filesystem;
py::object module = py::module::import("mssql_python");
py::object module_path = module.attr("__file__");
std::string module_file = module_path.cast<std::string>();

#ifdef _WIN32
// Windows-specific path handling
char path[MAX_PATH];
errno_t err = strncpy_s(path, MAX_PATH, module_file.c_str(), module_file.length());
if (err != 0) {
LOG("GetModuleDirectory: strncpy_s failed copying path - "
"error_code=%d, path_length=%zu",
err, module_file.length());
return {};
}
PathRemoveFileSpecA(path);
return std::string(path);
#else
// macOS/Unix path handling without using std::filesystem
std::string::size_type pos = module_file.find_last_of('/');
if (pos != std::string::npos) {
std::string dir = module_file.substr(0, pos);
return dir;
}
LOG("GetModuleDirectory: Could not extract directory from module path - "
"path='%s'",
module_file.c_str());
return module_file;
#endif
// Use std::filesystem::path for cross-platform path handling
// This properly handles UTF-8 encoded paths on all platforms
fs::path modulePath(module_file);
fs::path parentDir = modulePath.parent_path();

// Log path extraction for observability
LOG("GetModuleDirectory: Extracted directory - "
"original_path='%s', directory='%s'",
module_file.c_str(), parentDir.string().c_str());

// Return UTF-8 encoded string for consistent handling
// If parentDir is empty or invalid, subsequent operations (like LoadDriverLibrary)
// will fail naturally with clear error messages
return parentDir.string();
}

// Platform-agnostic function to load the driver dynamic library
DriverHandle LoadDriverLibrary(const std::string& driverPath) {
LOG("LoadDriverLibrary: Attempting to load ODBC driver from path='%s'", driverPath.c_str());

#ifdef _WIN32
// Windows: Convert string to wide string for LoadLibraryW
std::wstring widePath(driverPath.begin(), driverPath.end());
HMODULE handle = LoadLibraryW(widePath.c_str());
// Windows: Use std::filesystem::path for proper UTF-8 to UTF-16 conversion
// fs::path::c_str() returns wchar_t* on Windows with correct encoding
namespace fs = std::filesystem;
fs::path pathObj(driverPath);
HMODULE handle = LoadLibraryW(pathObj.c_str());
if (!handle) {
LOG("LoadDriverLibrary: LoadLibraryW failed for path='%s' - %s", driverPath.c_str(),
GetLastErrorMessage().c_str());
Expand Down Expand Up @@ -1013,8 +1006,8 @@ DriverHandle LoadDriverOrThrowException() {
fs::path dllDir = fs::path(moduleDir) / "libs" / "windows" / archDir;
fs::path authDllPath = dllDir / "mssql-auth.dll";
if (fs::exists(authDllPath)) {
HMODULE hAuth = LoadLibraryW(
std::wstring(authDllPath.native().begin(), authDllPath.native().end()).c_str());
// Use fs::path::c_str() which returns wchar_t* on Windows with proper encoding
HMODULE hAuth = LoadLibraryW(authDllPath.c_str());
if (hAuth) {
LOG("LoadDriverOrThrowException: mssql-auth.dll loaded "
"successfully from '%s'",
Expand Down
271 changes: 271 additions & 0 deletions tests/test_015_utf8_path_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
"""
Tests for UTF-8 path handling fix (Issue #370).

Verifies that the driver correctly handles paths containing non-ASCII
characters on Windows (e.g., usernames like 'Thalén', folders like 'café').

Bug Summary:
- GetModuleDirectory() used ANSI APIs (PathRemoveFileSpecA) which corrupted UTF-8 paths
- LoadDriverLibrary() used broken UTF-8→UTF-16 conversion: std::wstring(path.begin(), path.end())
- LoadDriverOrThrowException() used same broken pattern for mssql-auth.dll

Fix:
- Use std::filesystem::path which handles encoding correctly on all platforms
- fs::path::c_str() returns wchar_t* on Windows with proper UTF-16 encoding
"""

import pytest
import platform
import sys
import subprocess

import mssql_python
from mssql_python import ddbc_bindings


class TestPathHandlingCodePaths:
    """
    Smoke tests that always run.

    They rely on the module-level ``import mssql_python`` having succeeded,
    which (per the fix description above) is what drives the native
    GetModuleDirectory / LoadDriverLibrary code.
    """

    def test_module_import_exercises_path_handling(self):
        """
        The imported package object exists and exposes a string ``__file__``.

        According to the fix description, importing mssql_python triggers:
        1. GetModuleDirectory() - locating the package
        2. LoadDriverLibrary() - loading the ODBC driver
        3. LoadLibraryW() for mssql-auth.dll on Windows

        so a path-encoding failure in any of them would surface as an
        import error before this test body runs.
        """
        package = mssql_python
        assert package is not None
        assert hasattr(package, "__file__")
        assert isinstance(package.__file__, str)

    def test_module_path_is_valid_utf8(self):
        """The package path survives a UTF-8 encode/decode round trip."""
        original = mssql_python.__file__

        # Raises UnicodeError if the path cannot be represented in UTF-8.
        round_tripped = original.encode("utf-8").decode("utf-8")
        assert round_tripped == original

    def test_connect_function_available(self):
        """``mssql_python.connect`` exists and is callable."""
        assert hasattr(mssql_python, "connect")
        assert callable(mssql_python.connect)

    def test_ddbc_bindings_loaded(self):
        """The ddbc_bindings extension module was imported."""
        assert ddbc_bindings is not None

    def test_connection_class_available(self):
        """The Connection attribute of ddbc_bindings is reachable."""
        assert ddbc_bindings.Connection is not None


class TestPathWithNonAsciiCharacters:
    """
    Test path handling with non-ASCII characters in strings.

    These tests only exercise Python-side string and filesystem behaviour
    (UTF-8 round trips, creating directories under ``tmp_path``); they are a
    prerequisite check and do not call into the C++ bindings.
    """

    # Non-ASCII test strings representing real-world scenarios
    NON_ASCII_PATHS = [
        "Thalén",  # Swedish - the original issue reporter's username
        "café",  # French
        "日本語",  # Japanese
        "中文",  # Chinese
        "über",  # German
        "Müller",  # German umlaut
        "España",  # Spanish
        "Россия",  # Russian
        "한국어",  # Korean
        "Ñoño",  # Spanish ñ
        "Ångström",  # Swedish å
    ]

    @pytest.mark.parametrize("non_ascii_name", NON_ASCII_PATHS)
    def test_path_string_with_non_ascii(self, non_ascii_name):
        """A Windows-style path containing the name round-trips through UTF-8."""
        # Simulate Windows-style path
        test_path = f"C:\\Users\\{non_ascii_name}\\project\\.venv\\Lib\\site-packages"

        # Verify UTF-8 encoding/decoding works
        encoded = test_path.encode("utf-8")
        decoded = encoded.decode("utf-8")
        assert decoded == test_path
        assert non_ascii_name in decoded

    @pytest.mark.parametrize("non_ascii_name", NON_ASCII_PATHS)
    def test_pathlib_with_non_ascii(self, non_ascii_name, tmp_path):
        """A directory with a non-ASCII name can be created, written to and read."""
        test_dir = tmp_path / non_ascii_name
        test_dir.mkdir()
        assert test_dir.exists()

        # Create a file in the non-ASCII directory
        test_file = test_dir / "test.txt"
        test_file.write_text("test content", encoding="utf-8")
        assert test_file.exists()

        # Read back
        content = test_file.read_text(encoding="utf-8")
        assert content == "test content"

    def test_path_with_multiple_non_ascii_segments(self, tmp_path):
        """Nested directories with several non-ASCII segments can be created."""
        # Create nested directories with non-ASCII names
        nested = tmp_path / "Thalén" / "プロジェクト" / "código"
        nested.mkdir(parents=True)
        assert nested.exists()

    def test_path_with_spaces_and_non_ascii(self, tmp_path):
        """A directory name mixing spaces and non-ASCII characters can be created."""
        test_dir = tmp_path / "My Thalén Project"
        test_dir.mkdir()
        assert test_dir.exists()


@pytest.mark.skipif(
    platform.system() != "Windows", reason="DLL loading and path encoding issue is Windows-specific"
)
class TestWindowsSpecificPathHandling:
    """
    Checks that only make sense on Windows.

    The whole class is skipped on other platforms via the ``skipif`` marker.
    """

    def test_module_loads_on_windows(self):
        """The package imports and exposes ``ddbc_bindings``."""
        import mssql_python

        # Reaching this point means the import did not fail; per the fix
        # description that implies LoadLibraryW succeeded for:
        # - msodbcsql18.dll
        # - mssql-auth.dll (if exists)
        assert mssql_python.ddbc_bindings is not None

    def test_libs_directory_exists(self):
        """At least one architecture folder exists under ``libs/windows``."""
        from pathlib import Path

        libs_dir = Path(mssql_python.__file__).parent / "libs" / "windows"

        # Collect every known architecture directory that is present.
        present = [name for name in ("x64", "x86", "arm64") if (libs_dir / name).exists()]
        assert present, f"No architecture directory found in {libs_dir}"

    def test_auth_dll_exists_if_libs_present(self):
        """If the architecture folder exists, it must contain mssql-auth.dll."""
        from pathlib import Path
        import struct

        package_dir = Path(mssql_python.__file__).parent

        # ARM64 machines take precedence; otherwise pick by pointer width.
        if platform.machine().lower() in ("arm64", "aarch64"):
            arch = "arm64"
        elif struct.calcsize("P") * 8 == 64:
            arch = "x64"
        else:
            arch = "x86"

        auth_dll = package_dir / "libs" / "windows" / arch / "mssql-auth.dll"

        # Nothing to verify when the architecture directory is absent.
        if not auth_dll.parent.exists():
            return

        assert auth_dll.exists(), f"mssql-auth.dll not found at {auth_dll}"


class TestPathEncodingEdgeCases:
    """Test edge cases in path encoding handling."""

    def test_ascii_only_path_still_works(self):
        """Verify ASCII-only paths continue to work (regression test)."""
        # If we got here, module loaded successfully
        assert mssql_python is not None

    def test_path_with_spaces(self):
        """Verify paths with spaces work (common Windows scenario)."""
        # Common Windows paths like "Program Files" have spaces
        # Module should load regardless
        assert mssql_python.__file__ is not None

    def test_very_long_path_component(self, tmp_path):
        """A 200-character directory name can be created under ``tmp_path``."""
        # Windows MAX_PATH is 260, but individual components can be up to 255
        long_name = "a" * 200
        test_dir = tmp_path / long_name
        test_dir.mkdir()
        assert test_dir.exists()

    @pytest.mark.parametrize(
        "char",
        [
            "é",
            "ñ",
            "ü",
            "ö",
            "å",
            "ø",
            "æ",  # European diacritics
            "中",
            "日",
            "한",  # CJK ideographs
            "α",
            "β",
            "γ",  # Greek letters
            "й",
            "ж",
            "щ",  # Cyrillic
        ],
    )
    def test_individual_non_ascii_chars_utf8_roundtrip(self, char):
        """Test UTF-8 encoding roundtrip for individual non-ASCII characters."""
        test_path = f"C:\\Users\\Test{char}User\\project"

        # UTF-8 roundtrip
        encoded = test_path.encode("utf-8")
        decoded = encoded.decode("utf-8")
        assert decoded == test_path
        assert char in decoded

    def test_emoji_in_path(self, tmp_path):
        """Test path with emoji characters (supplementary plane)."""
        # Emoji are in the supplementary planes (> U+FFFF)
        # This tests 4-byte UTF-8 sequences
        try:
            emoji_dir = tmp_path / "test_🚀_project"
            emoji_dir.mkdir()
            assert emoji_dir.exists()
        except OSError:
            # Some filesystems don't support emoji in filenames
            pytest.skip("Filesystem doesn't support emoji in filenames")

    def test_mixed_scripts_in_path(self, tmp_path):
        """Test path with mixed scripts (Latin + CJK + Cyrillic)."""
        mixed_name = "Project_项目_Проект"
        test_dir = tmp_path / mixed_name
        test_dir.mkdir()
        assert test_dir.exists()
Loading