Skip to content

Commit f8a6ca1

Browse files
authored
FEAT: Pass all connection string params to mssql-py-core for bulk copy (#439)
### Work Item / Issue Reference <!-- IMPORTANT: Please follow the PR template guidelines below. For mssql-python maintainers: Insert your ADO Work Item ID below For external contributors: Insert Github Issue number below Only one reference is required - either GitHub issue OR ADO Work Item. --> <!-- mssql-python maintainers: ADO Work Item --> > [AB#42663](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/42663) <!-- External contributors: GitHub Issue --> > GitHub Issue: #<ISSUE_NUMBER> ------------------------------------------------------------------- ### Summary <!-- Insert your summary of changes below. Minimum 10 characters required. --> This pull request refactors and simplifies how ODBC connection string parameters are translated for use with the bulk copy feature in `mssql_python`. The key change is the introduction of a new helper function, `connstr_to_pycore_params`, which centralizes and standardizes the parameter mapping logic. This reduces code duplication, improves maintainability, and ensures sensitive data is handled more securely. **Bulk copy connection context refactor:** * Added a new helper function `connstr_to_pycore_params` in `mssql_python/helpers.py` to translate ODBC connection string parameters into the format expected by the py-core bulk copy path, including type conversion and key normalization. * Updated `mssql_python/cursor.py` to use `connstr_to_pycore_params` for building the `pycore_context` dictionary, replacing the previous inline mapping and logic. [[1]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280L20-R20) [[2]](diffhunk://#diff-deceea46ae01082ce8400e14fa02f4b7585afb7b5ed9885338b66494f5f38280L2588-R2592) **Security and code quality improvements:** * Simplified sensitive data cleanup in the bulk copy context by iterating over a tuple of sensitive keys instead of repeating `pop` calls. * Removed redundant manual extraction and assignment of authentication parameters in favor of the new helper function, reducing the risk of accidentally logging or exposing credentials. **Documentation and maintainability:** * Added detailed documentation to the new helper function explaining its purpose, supported keys, and conversion logic. * Added a clarifying comment in `cursor.py` indicating the mapping from ODBC connection-string keywords. <!-- ### PR Title Guide > For feature requests FEAT: (short-description) > For non-feature requests like test case updates, config updates , dependency updates etc CHORE: (short-description) > For Fix requests FIX: (short-description) > For doc update requests DOC: (short-description) > For Formatting, indentation, or styling update STYLE: (short-description) > For Refactor, without any feature changes REFACTOR: (short-description) > For release related changes, without any feature changes RELEASE: #<RELEASE_VERSION> (short-description) ### Contribution Guidelines External contributors: - Create a GitHub issue first: https://github.com/microsoft/mssql-python/issues/new - Link the GitHub issue in the "GitHub Issue" section above - Follow the PR title format and provide a meaningful summary mssql-python maintainers: - Create an ADO Work Item following internal processes - Link the ADO Work Item in the "ADO Work Item" section above - Follow the PR title format and provide a meaningful summary -->
1 parent c4e647e commit f8a6ca1

6 files changed

Lines changed: 202 additions & 45 deletions

File tree

.github/workflows/pr-code-coverage.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -426,8 +426,8 @@ jobs:
426426
--arg covered_lines "${{ env.COVERED_LINES }}" \
427427
--arg total_lines "${{ env.TOTAL_LINES }}" \
428428
--arg patch_coverage_pct "${{ env.PATCH_COVERAGE_PCT }}" \
429-
--arg low_coverage_files "${{ env.LOW_COVERAGE_FILES }}" \
430-
--arg patch_coverage_summary "${{ env.PATCH_COVERAGE_SUMMARY }}" \
429+
--arg low_coverage_files "$LOW_COVERAGE_FILES" \
430+
--arg patch_coverage_summary "$PATCH_COVERAGE_SUMMARY" \
431431
--arg ado_url "${{ env.ADO_URL }}" \
432432
'{
433433
pr_number: $pr_number,

mssql_python/connection_string_parser.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,10 @@ def _normalize_params(params: Dict[str, str], warn_rejected: bool = True) -> Dic
108108
if normalized_key in _RESERVED_PARAMETERS:
109109
continue
110110

111-
# Parameter is allowed
112-
filtered[normalized_key] = value
111+
# First-wins: match ODBC behaviour where the first
112+
# occurrence of a synonym group takes precedence.
113+
if normalized_key not in filtered:
114+
filtered[normalized_key] = value
113115
else:
114116
# Parameter is not in allow-list
115117
# Note: In normal flow, this should be empty since parser validates first

mssql_python/cursor.py

Lines changed: 7 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import warnings
1818
from typing import List, Union, Any, Optional, Tuple, Sequence, TYPE_CHECKING, Iterable
1919
from mssql_python.constants import ConstantsDDBC as ddbc_sql_const, SQLTypes
20-
from mssql_python.helpers import check_error
20+
from mssql_python.helpers import check_error, connstr_to_pycore_params
2121
from mssql_python.logging import logger
2222
from mssql_python import ddbc_bindings
2323
from mssql_python.exceptions import (
@@ -2498,6 +2498,7 @@ def nextset(self) -> Union[bool, None]:
24982498
)
24992499
return True
25002500

2501+
# ── Mapping from ODBC connection-string keywords (lowercase, as _parse returns)
25012502
def _bulkcopy(
25022503
self,
25032504
table_name: str,
@@ -2632,38 +2633,10 @@ def _bulkcopy(
26322633
"Specify the target database explicitly to avoid accidentally writing to system databases."
26332634
)
26342635

2635-
# Build connection context for bulk copy library
2636-
# Note: Password is extracted separately to avoid storing it in the main context
2637-
# dict that could be accidentally logged or exposed in error messages.
2638-
trust_cert = params.get("trustservercertificate", "yes").lower() in ("yes", "true")
2639-
2640-
# Parse encryption setting from connection string
2641-
encrypt_param = params.get("encrypt")
2642-
if encrypt_param is not None:
2643-
encrypt_value = encrypt_param.strip().lower()
2644-
if encrypt_value in ("yes", "true", "mandatory", "required"):
2645-
encryption = "Required"
2646-
elif encrypt_value in ("no", "false", "optional"):
2647-
encryption = "Optional"
2648-
else:
2649-
# Pass through unrecognized values (e.g., "Strict") to the underlying driver
2650-
encryption = encrypt_param
2651-
else:
2652-
encryption = "Optional"
2653-
2654-
context = {
2655-
"server": params.get("server"),
2656-
"database": params.get("database"),
2657-
"trust_server_certificate": trust_cert,
2658-
"encryption": encryption,
2659-
}
2660-
2661-
# Build pycore_context with appropriate authentication.
2662-
# For Azure AD: acquire a FRESH token right now instead of reusing
2663-
# the one from connect() time — avoids expired-token errors when
2664-
# bulkcopy() is called long after the original connection.
2665-
pycore_context = dict(context)
2636+
# Translate parsed connection string into the dict py-core expects.
2637+
pycore_context = connstr_to_pycore_params(params)
26662638

2639+
# Token acquisition — only thing cursor must handle (needs azure-identity SDK)
26672640
if self.connection._auth_type:
26682641
# Fresh token acquisition for mssql-py-core connection
26692642
from mssql_python.auth import AADAuth
@@ -2680,10 +2653,6 @@ def _bulkcopy(
26802653
"Bulk copy: acquired fresh Azure AD token for auth_type=%s",
26812654
self.connection._auth_type,
26822655
)
2683-
else:
2684-
# SQL Server authentication — use uid/password from connection string
2685-
pycore_context["user_name"] = params.get("uid", "")
2686-
pycore_context["password"] = params.get("pwd", "")
26872656

26882657
pycore_connection = None
26892658
pycore_cursor = None
@@ -2722,9 +2691,8 @@ def _bulkcopy(
27222691
finally:
27232692
# Clear sensitive data to minimize memory exposure
27242693
if pycore_context:
2725-
pycore_context.pop("password", None)
2726-
pycore_context.pop("user_name", None)
2727-
pycore_context.pop("access_token", None)
2694+
for key in ("password", "user_name", "access_token"):
2695+
pycore_context.pop(key, None)
27282696
# Clean up bulk copy resources
27292697
for resource in (pycore_cursor, pycore_connection):
27302698
if resource and hasattr(resource, "close"):

mssql_python/helpers.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,97 @@ def _sanitize_for_logging(input_val: Any, max_length: int = max_log_length) -> s
250250
return True, None, sanitized_attr, sanitized_val
251251

252252

253+
def connstr_to_pycore_params(params: dict) -> dict:
254+
"""Translate parsed ODBC connection-string params for py-core's bulk copy path.
255+
256+
When ``cursor.bulkcopy()`` is called, mssql-python opens a *separate*
257+
connection through mssql-py-core.
258+
py-core's ``connection.rs`` expects a Python dict with snake_case keys —
259+
different from the ODBC-style keys that ``_ConnectionStringParser._parse``
260+
returns.
261+
262+
This function bridges that gap: it maps lowercase ODBC keys (e.g.
263+
``"trustservercertificate"``) to py-core keys (``"trust_server_certificate"``)
264+
and converts numeric strings to ``int`` for timeout/size params.
265+
Boolean params (TrustServerCertificate, MultiSubnetFailover) are passed as
266+
strings — ``connection.rs`` validates Yes/No and rejects invalid values.
267+
Unrecognised keys are silently dropped.
268+
"""
269+
# Only keys listed below are forwarded to py-core.
270+
# Unknown/reserved keys (app, workstationid, language, connect_timeout,
271+
# mars_connection) are silently dropped here. In the normal connect()
272+
# path the parser validates keywords first (validate_keywords=True),
273+
# but bulkcopy parses with validation off, so this mapping is the
274+
# authoritative filter in that path.
275+
key_map = {
276+
# auth / credentials
277+
"uid": "user_name",
278+
"pwd": "password",
279+
"trusted_connection": "trusted_connection",
280+
"authentication": "authentication",
281+
# server (accept parser synonyms)
282+
"server": "server",
283+
"addr": "server",
284+
"address": "server",
285+
# database
286+
"database": "database",
287+
"applicationintent": "application_intent",
288+
# encryption / TLS (include snake_case alias the parser may emit)
289+
"encrypt": "encryption",
290+
"trustservercertificate": "trust_server_certificate",
291+
"trust_server_certificate": "trust_server_certificate",
292+
"hostnameincertificate": "host_name_in_certificate",
293+
"servercertificate": "server_certificate",
294+
# Kerberos
295+
"serverspn": "server_spn",
296+
# network
297+
"multisubnetfailover": "multi_subnet_failover",
298+
"ipaddresspreference": "ip_address_preference",
299+
"keepalive": "keep_alive",
300+
"keepaliveinterval": "keep_alive_interval",
301+
# sizing / limits ("packet size" with space is a common pyodbc-ism)
302+
"packetsize": "packet_size",
303+
"packet size": "packet_size",
304+
"connectretrycount": "connect_retry_count",
305+
"connectretryinterval": "connect_retry_interval",
306+
}
307+
int_keys = {
308+
"packet_size",
309+
"connect_retry_count",
310+
"connect_retry_interval",
311+
"keep_alive",
312+
"keep_alive_interval",
313+
}
314+
315+
pycore_params: dict = {}
316+
317+
for connstr_key, pycore_key in key_map.items():
318+
raw_value = params.get(connstr_key)
319+
if raw_value is None:
320+
continue
321+
322+
# First-wins: match ODBC behaviour — first synonym in the
323+
# connection string takes precedence (e.g. Addr before Server).
324+
if pycore_key in pycore_params:
325+
continue
326+
327+
# ODBC values are always strings; py-core expects native types for int keys.
328+
# Boolean params (trust_server_certificate, multi_subnet_failover) are passed
329+
# as strings — all Yes/No validation is in connection.rs for single-location
330+
# consistency with Encrypt, ApplicationIntent, IPAddressPreference, etc.
331+
if pycore_key in int_keys:
332+
# Numeric params (timeouts, packet size, etc.) — skip on bad input
333+
try:
334+
pycore_params[pycore_key] = int(raw_value)
335+
except (ValueError, TypeError):
336+
pass # let py-core fall back to its compiled-in default
337+
else:
338+
# String params (server, database, encryption, etc.) — pass through
339+
pycore_params[pycore_key] = raw_value
340+
341+
return pycore_params
342+
343+
253344
# Settings functionality moved here to avoid circular imports
254345

255346
# Initialize the locale setting only once at module import time

tests/test_010_connection_string_parser.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,3 +440,99 @@ def test_incomplete_entry_recovery(self):
440440
# Should have error about incomplete 'Server'
441441
errors = exc_info.value.errors
442442
assert any("Server" in err and "Incomplete specification" in err for err in errors)
443+
444+
445+
class TestSynonymFirstWins:
446+
"""
447+
Verify that _normalize_params uses first-wins for synonym keys.
448+
449+
ODBC Driver 18 behaviour (confirmed via live test against sqlcconn.cpp):
450+
- Same key repeated → first-wins (fFromAttrOrProp guard)
451+
- Addr vs Address → same KEY_ADDR slot, first-wins
452+
- Addr/Address vs Server → separate slots, Addr/Address takes priority
453+
454+
_ConnectionStringParser._parse() rejects exact duplicate keys outright.
455+
These tests cover synonyms that map to the same canonical key during
456+
normalization (e.g. addr/address/server → "Server").
457+
"""
458+
459+
@staticmethod
460+
def _normalize(raw: dict) -> dict:
461+
"""Shorthand for calling _normalize_params with warnings suppressed."""
462+
return _ConnectionStringParser._normalize_params(raw, warn_rejected=False)
463+
464+
# ---- server / addr / address synonyms --------------------------------
465+
466+
def test_server_then_addr_first_wins(self):
467+
"""Server=A;Addr=B → first-wins keeps A."""
468+
result = self._normalize({"server": "hostA", "addr": "hostB"})
469+
assert result["Server"] == "hostA"
470+
471+
def test_addr_then_server_first_wins(self):
472+
"""Addr=A;Server=B → first-wins keeps A."""
473+
result = self._normalize({"addr": "hostA", "server": "hostB"})
474+
assert result["Server"] == "hostA"
475+
476+
def test_address_then_server_first_wins(self):
477+
"""Address=A;Server=B → first-wins keeps A."""
478+
result = self._normalize({"address": "hostA", "server": "hostB"})
479+
assert result["Server"] == "hostA"
480+
481+
def test_addr_then_address_first_wins(self):
482+
"""Addr=A;Address=B → first-wins keeps A."""
483+
result = self._normalize({"addr": "hostA", "address": "hostB"})
484+
assert result["Server"] == "hostA"
485+
486+
def test_all_three_server_synonyms_first_wins(self):
487+
"""Addr=A;Address=B;Server=C → first-wins keeps A."""
488+
result = self._normalize({"addr": "hostA", "address": "hostB", "server": "hostC"})
489+
assert result["Server"] == "hostA"
490+
491+
def test_server_only_no_synonyms(self):
492+
"""Single key has no conflict."""
493+
result = self._normalize({"server": "hostA"})
494+
assert result["Server"] == "hostA"
495+
496+
# ---- trustservercertificate / trust_server_certificate synonyms ------
497+
498+
def test_trustservercertificate_then_snake_case_first_wins(self):
499+
"""trustservercertificate=Yes;trust_server_certificate=No → first-wins keeps Yes."""
500+
result = self._normalize(
501+
{"trustservercertificate": "Yes", "trust_server_certificate": "No"}
502+
)
503+
assert result["TrustServerCertificate"] == "Yes"
504+
505+
def test_snake_case_then_trustservercertificate_first_wins(self):
506+
"""trust_server_certificate=No;trustservercertificate=Yes → first-wins keeps No."""
507+
result = self._normalize(
508+
{"trust_server_certificate": "No", "trustservercertificate": "Yes"}
509+
)
510+
assert result["TrustServerCertificate"] == "No"
511+
512+
# ---- packetsize / "packet size" synonyms -----------------------------
513+
514+
def test_packetsize_then_packet_space_first_wins(self):
515+
"""packetsize=8192;packet size=4096 → first-wins keeps 8192."""
516+
result = self._normalize({"packetsize": "8192", "packet size": "4096"})
517+
assert result["PacketSize"] == "8192"
518+
519+
def test_packet_space_then_packetsize_first_wins(self):
520+
"""packet size=4096;packetsize=8192 → first-wins keeps 4096."""
521+
result = self._normalize({"packet size": "4096", "packetsize": "8192"})
522+
assert result["PacketSize"] == "4096"
523+
524+
# ---- non-synonym keys are unaffected ---------------------------------
525+
526+
def test_different_keys_both_kept(self):
527+
"""Non-synonym keys should both be present."""
528+
result = self._normalize({"server": "host", "database": "mydb", "uid": "sa"})
529+
assert result == {"Server": "host", "Database": "mydb", "UID": "sa"}
530+
531+
# ---- reserved keys filtered regardless of order ----------------------
532+
533+
def test_reserved_keys_always_filtered(self):
534+
"""Driver and APP are always stripped, even when first."""
535+
result = self._normalize({"driver": "foo", "server": "host", "app": "bar"})
536+
assert "Driver" not in result
537+
assert "APP" not in result
538+
assert result["Server"] == "host"

tests/test_011_connection_string_allowlist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ def test__normalize_params_handles_address_variants(self):
130130
"""Test filtering handles address/addr/server as synonyms."""
131131
params = {"address": "addr1", "addr": "addr2", "server": "server1"}
132132
filtered = _ConnectionStringParser._normalize_params(params, warn_rejected=False)
133-
# All three are synonyms that map to 'Server', last one wins
134-
assert filtered["Server"] == "server1"
133+
# All three are synonyms that map to 'Server', first one wins
134+
assert filtered["Server"] == "addr1"
135135
assert "Address" not in filtered
136136
assert "Addr" not in filtered
137137

0 commit comments

Comments
 (0)