Skip to content

Commit 1187a4a

Browse files
authored
FIX: Resolve paths with non-ASCII characters in Windows (#376)
1 parent 006f96a commit 1187a4a

2 files changed

Lines changed: 293 additions & 29 deletions

File tree

mssql_python/pybind/ddbc_bindings.cpp

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -845,44 +845,37 @@ std::string GetLastErrorMessage();
845845

846846
// TODO: Move this to Python
847847
std::string GetModuleDirectory() {
848+
namespace fs = std::filesystem;
848849
py::object module = py::module::import("mssql_python");
849850
py::object module_path = module.attr("__file__");
850851
std::string module_file = module_path.cast<std::string>();
851852

852-
#ifdef _WIN32
853-
// Windows-specific path handling
854-
char path[MAX_PATH];
855-
errno_t err = strncpy_s(path, MAX_PATH, module_file.c_str(), module_file.length());
856-
if (err != 0) {
857-
LOG("GetModuleDirectory: strncpy_s failed copying path - "
858-
"error_code=%d, path_length=%zu",
859-
err, module_file.length());
860-
return {};
861-
}
862-
PathRemoveFileSpecA(path);
863-
return std::string(path);
864-
#else
865-
// macOS/Unix path handling without using std::filesystem
866-
std::string::size_type pos = module_file.find_last_of('/');
867-
if (pos != std::string::npos) {
868-
std::string dir = module_file.substr(0, pos);
869-
return dir;
870-
}
871-
LOG("GetModuleDirectory: Could not extract directory from module path - "
872-
"path='%s'",
873-
module_file.c_str());
874-
return module_file;
875-
#endif
853+
// Use std::filesystem::path for cross-platform path handling
854+
// This properly handles UTF-8 encoded paths on all platforms
855+
fs::path modulePath(module_file);
856+
fs::path parentDir = modulePath.parent_path();
857+
858+
// Log path extraction for observability
859+
LOG("GetModuleDirectory: Extracted directory - "
860+
"original_path='%s', directory='%s'",
861+
module_file.c_str(), parentDir.string().c_str());
862+
863+
// Return UTF-8 encoded string for consistent handling
864+
// If parentDir is empty or invalid, subsequent operations (like LoadDriverLibrary)
865+
// will fail naturally with clear error messages
866+
return parentDir.string();
876867
}
877868

878869
// Platform-agnostic function to load the driver dynamic library
879870
DriverHandle LoadDriverLibrary(const std::string& driverPath) {
880871
LOG("LoadDriverLibrary: Attempting to load ODBC driver from path='%s'", driverPath.c_str());
881872

882873
#ifdef _WIN32
883-
// Windows: Convert string to wide string for LoadLibraryW
884-
std::wstring widePath(driverPath.begin(), driverPath.end());
885-
HMODULE handle = LoadLibraryW(widePath.c_str());
874+
// Windows: Use std::filesystem::path for proper UTF-8 to UTF-16 conversion
875+
// fs::path::c_str() returns wchar_t* on Windows with correct encoding
876+
namespace fs = std::filesystem;
877+
fs::path pathObj(driverPath);
878+
HMODULE handle = LoadLibraryW(pathObj.c_str());
886879
if (!handle) {
887880
LOG("LoadDriverLibrary: LoadLibraryW failed for path='%s' - %s", driverPath.c_str(),
888881
GetLastErrorMessage().c_str());
@@ -1013,8 +1006,8 @@ DriverHandle LoadDriverOrThrowException() {
10131006
fs::path dllDir = fs::path(moduleDir) / "libs" / "windows" / archDir;
10141007
fs::path authDllPath = dllDir / "mssql-auth.dll";
10151008
if (fs::exists(authDllPath)) {
1016-
HMODULE hAuth = LoadLibraryW(
1017-
std::wstring(authDllPath.native().begin(), authDllPath.native().end()).c_str());
1009+
// Use fs::path::c_str() which returns wchar_t* on Windows with proper encoding
1010+
HMODULE hAuth = LoadLibraryW(authDllPath.c_str());
10181011
if (hAuth) {
10191012
LOG("LoadDriverOrThrowException: mssql-auth.dll loaded "
10201013
"successfully from '%s'",
Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
"""
2+
Tests for UTF-8 path handling fix (Issue #370).
3+
4+
Verifies that the driver correctly handles paths containing non-ASCII
5+
characters on Windows (e.g., usernames like 'Thalén', folders like 'café').
6+
7+
Bug Summary:
8+
- GetModuleDirectory() used ANSI APIs (PathRemoveFileSpecA) which corrupted UTF-8 paths
9+
- LoadDriverLibrary() used broken UTF-8→UTF-16 conversion: std::wstring(path.begin(), path.end())
10+
- LoadDriverOrThrowException() used same broken pattern for mssql-auth.dll
11+
12+
Fix:
13+
- Use std::filesystem::path which handles encoding correctly on all platforms
14+
- fs::path::c_str() returns wchar_t* on Windows with proper UTF-16 encoding
15+
"""
16+
17+
import pytest
18+
import platform
19+
import sys
20+
import subprocess
21+
22+
import mssql_python
23+
from mssql_python import ddbc_bindings
24+
25+
26+
class TestPathHandlingCodePaths:
27+
"""
28+
Test that path handling code paths are exercised correctly.
29+
30+
These tests run by DEFAULT and verify the fixed C++ functions
31+
(GetModuleDirectory, LoadDriverLibrary) are working.
32+
"""
33+
34+
def test_module_import_exercises_path_handling(self):
35+
"""
36+
Verify module import succeeds - this exercises GetModuleDirectory().
37+
38+
When mssql_python imports, it calls:
39+
1. GetModuleDirectory() - to find module location
40+
2. LoadDriverLibrary() - to load ODBC driver
41+
3. LoadLibraryW() for mssql-auth.dll on Windows
42+
43+
If any of these fail due to path encoding issues, import fails.
44+
"""
45+
assert mssql_python is not None
46+
assert hasattr(mssql_python, "__file__")
47+
assert isinstance(mssql_python.__file__, str)
48+
49+
def test_module_path_is_valid_utf8(self):
50+
"""Verify module path is valid UTF-8 string."""
51+
module_path = mssql_python.__file__
52+
53+
# Should be encodable/decodable as UTF-8 without errors
54+
encoded = module_path.encode("utf-8")
55+
decoded = encoded.decode("utf-8")
56+
assert decoded == module_path
57+
58+
def test_connect_function_available(self):
59+
"""Verify connect function is available (proves ddbc_bindings loaded)."""
60+
assert hasattr(mssql_python, "connect")
61+
assert callable(mssql_python.connect)
62+
63+
def test_ddbc_bindings_loaded(self):
64+
"""Verify ddbc_bindings C++ module loaded successfully."""
65+
assert ddbc_bindings is not None
66+
67+
def test_connection_class_available(self):
68+
"""Verify Connection class from C++ bindings is accessible."""
69+
assert ddbc_bindings.Connection is not None
70+
71+
72+
class TestPathWithNonAsciiCharacters:
73+
"""
74+
Test path handling with non-ASCII characters in strings.
75+
76+
These tests verify that Python string operations with non-ASCII
77+
characters work correctly (prerequisite for the C++ fix to work).
78+
"""
79+
80+
# Non-ASCII test strings representing real-world scenarios
81+
NON_ASCII_PATHS = [
82+
"Thalén", # Swedish - the original issue reporter's username
83+
"café", # French
84+
"日本語", # Japanese
85+
"中文", # Chinese
86+
"über", # German
87+
"Müller", # German umlaut
88+
"España", # Spanish
89+
"Россия", # Russian
90+
"한국어", # Korean
91+
"Ñoño", # Spanish ñ
92+
"Ångström", # Swedish å
93+
]
94+
95+
@pytest.mark.parametrize("non_ascii_name", NON_ASCII_PATHS)
96+
def test_path_string_with_non_ascii(self, non_ascii_name):
97+
"""Test that Python can handle paths with non-ASCII characters."""
98+
# Simulate Windows-style path
99+
test_path = f"C:\\Users\\{non_ascii_name}\\project\\.venv\\Lib\\site-packages"
100+
101+
# Verify UTF-8 encoding/decoding works
102+
encoded = test_path.encode("utf-8")
103+
decoded = encoded.decode("utf-8")
104+
assert decoded == test_path
105+
assert non_ascii_name in decoded
106+
107+
@pytest.mark.parametrize("non_ascii_name", NON_ASCII_PATHS)
108+
def test_pathlib_with_non_ascii(self, non_ascii_name, tmp_path):
109+
"""Test that pathlib handles non-ASCII directory names."""
110+
from pathlib import Path
111+
112+
test_dir = tmp_path / non_ascii_name
113+
test_dir.mkdir()
114+
assert test_dir.exists()
115+
116+
# Create a file in the non-ASCII directory
117+
test_file = test_dir / "test.txt"
118+
test_file.write_text("test content", encoding="utf-8")
119+
assert test_file.exists()
120+
121+
# Read back
122+
content = test_file.read_text(encoding="utf-8")
123+
assert content == "test content"
124+
125+
def test_path_with_multiple_non_ascii_segments(self, tmp_path):
126+
"""Test path with multiple non-ASCII directory segments."""
127+
from pathlib import Path
128+
129+
# Create nested directories with non-ASCII names
130+
nested = tmp_path / "Thalén" / "プロジェクト" / "código"
131+
nested.mkdir(parents=True)
132+
assert nested.exists()
133+
134+
def test_path_with_spaces_and_non_ascii(self, tmp_path):
135+
"""Test path with both spaces and non-ASCII characters."""
136+
from pathlib import Path
137+
138+
test_dir = tmp_path / "My Thalén Project"
139+
test_dir.mkdir()
140+
assert test_dir.exists()
141+
142+
143+
@pytest.mark.skipif(
144+
platform.system() != "Windows", reason="DLL loading and path encoding issue is Windows-specific"
145+
)
146+
class TestWindowsSpecificPathHandling:
147+
"""
148+
Windows-specific tests for path handling.
149+
150+
These tests verify Windows-specific behavior related to the fix.
151+
"""
152+
153+
def test_module_loads_on_windows(self):
154+
"""Verify module loads correctly on Windows."""
155+
import mssql_python
156+
157+
# If we get here, LoadLibraryW succeeded for:
158+
# - msodbcsql18.dll
159+
# - mssql-auth.dll (if exists)
160+
assert mssql_python.ddbc_bindings is not None
161+
162+
def test_libs_directory_exists(self):
163+
"""Verify the libs/windows directory structure exists."""
164+
from pathlib import Path
165+
166+
module_dir = Path(mssql_python.__file__).parent
167+
libs_dir = module_dir / "libs" / "windows"
168+
169+
# Check that at least one architecture directory exists
170+
arch_dirs = ["x64", "x86", "arm64"]
171+
found_arch = any((libs_dir / arch).exists() for arch in arch_dirs)
172+
assert found_arch, f"No architecture directory found in {libs_dir}"
173+
174+
def test_auth_dll_exists_if_libs_present(self):
175+
"""Verify mssql-auth.dll exists in the libs directory."""
176+
from pathlib import Path
177+
import struct
178+
179+
module_dir = Path(mssql_python.__file__).parent
180+
181+
# Determine architecture
182+
arch = "x64" if struct.calcsize("P") * 8 == 64 else "x86"
183+
# Check for ARM64
184+
185+
if platform.machine().lower() in ("arm64", "aarch64"):
186+
arch = "arm64"
187+
188+
auth_dll = module_dir / "libs" / "windows" / arch / "mssql-auth.dll"
189+
190+
if auth_dll.parent.exists():
191+
# If the directory exists, the DLL should be there
192+
assert auth_dll.exists(), f"mssql-auth.dll not found at {auth_dll}"
193+
194+
195+
class TestPathEncodingEdgeCases:
196+
"""Test edge cases in path encoding handling."""
197+
198+
def test_ascii_only_path_still_works(self):
199+
"""Verify ASCII-only paths continue to work (regression test)."""
200+
# If we got here, module loaded successfully
201+
assert mssql_python is not None
202+
203+
def test_path_with_spaces(self):
204+
"""Verify paths with spaces work (common Windows scenario)."""
205+
# Common Windows paths like "Program Files" have spaces
206+
# Module should load regardless
207+
assert mssql_python.__file__ is not None
208+
209+
def test_very_long_path_component(self, tmp_path):
210+
"""Test handling of long path components."""
211+
from pathlib import Path
212+
213+
# Windows MAX_PATH is 260, but individual components can be up to 255
214+
long_name = "a" * 200
215+
test_dir = tmp_path / long_name
216+
test_dir.mkdir()
217+
assert test_dir.exists()
218+
219+
@pytest.mark.parametrize(
220+
"char",
221+
[
222+
"é",
223+
"ñ",
224+
"ü",
225+
"ö",
226+
"å",
227+
"ø",
228+
"æ", # European diacritics
229+
"中",
230+
"日",
231+
"한", # CJK ideographs
232+
"α",
233+
"β",
234+
"γ", # Greek letters
235+
"й",
236+
"ж",
237+
"щ", # Cyrillic
238+
],
239+
)
240+
def test_individual_non_ascii_chars_utf8_roundtrip(self, char):
241+
"""Test UTF-8 encoding roundtrip for individual non-ASCII characters."""
242+
test_path = f"C:\\Users\\Test{char}User\\project"
243+
244+
# UTF-8 roundtrip
245+
encoded = test_path.encode("utf-8")
246+
decoded = encoded.decode("utf-8")
247+
assert decoded == test_path
248+
assert char in decoded
249+
250+
def test_emoji_in_path(self, tmp_path):
251+
"""Test path with emoji characters (supplementary plane)."""
252+
from pathlib import Path
253+
254+
# Emoji are in the supplementary planes (> U+FFFF)
255+
# This tests 4-byte UTF-8 sequences
256+
try:
257+
emoji_dir = tmp_path / "test_🚀_project"
258+
emoji_dir.mkdir()
259+
assert emoji_dir.exists()
260+
except OSError:
261+
# Some filesystems don't support emoji in filenames
262+
pytest.skip("Filesystem doesn't support emoji in filenames")
263+
264+
def test_mixed_scripts_in_path(self, tmp_path):
265+
"""Test path with mixed scripts (Latin + CJK + Cyrillic)."""
266+
from pathlib import Path
267+
268+
mixed_name = "Project_项目_Проект"
269+
test_dir = tmp_path / mixed_name
270+
test_dir.mkdir()
271+
assert test_dir.exists()

0 commit comments

Comments
 (0)