Skip to content

Commit 6d3196f

Browse files
nicknick
authored andcommitted
fixes of zip extraction and xml parsing & tests
1 parent da6c97d commit 6d3196f

File tree

9 files changed

+717
-60
lines changed

9 files changed

+717
-60
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,9 @@
44

55
/.vs
66
/.venv
7-
BUILDING.md
7+
BUILDING.md
8+
/.vscode
9+
/build
10+
test.py
11+
report.docx
12+
docx_comment_parser.pyd

CMakeLists.txt

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,114 @@
11
cmake_minimum_required(VERSION 3.15)
22
project(docx_comment_parser VERSION 1.0.0 LANGUAGES CXX)
33

4-
# ─── Standard & optimisation flags ──────────────────────────────────────────
5-
set(CMAKE_CXX_STANDARD 17)
4+
# ─── Policy: normalise install() DESTINATION paths (fixes CMP0177 warning) ───
5+
# Python3_SITEARCH on MinGW contains backslashes which CMake 3.26+ normalises
6+
# automatically under the NEW policy. Setting it explicitly silences the warning
7+
# on all CMake versions ≥ 3.15 that support CMP0177.
8+
if(POLICY CMP0177)
9+
cmake_policy(SET CMP0177 NEW)
10+
endif()
11+
12+
# ─── C++ standard ─────────────────────────────────────────────────────────────
13+
set(CMAKE_CXX_STANDARD 17)
614
set(CMAKE_CXX_STANDARD_REQUIRED ON)
7-
set(CMAKE_CXX_EXTENSIONS OFF)
15+
set(CMAKE_CXX_EXTENSIONS OFF)
816

917
if(NOT CMAKE_BUILD_TYPE)
1018
set(CMAKE_BUILD_TYPE Release)
1119
endif()
1220

13-
# LTO for Release builds
14-
include(CheckIPOSupported)
15-
check_ipo_supported(RESULT _ipo_ok OUTPUT _ipo_err)
16-
if(_ipo_ok)
17-
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE ON)
21+
# ─── LTO (Release only, skip on MinGW where it is unreliable) ─────────────────
22+
if(NOT MINGW)
23+
include(CheckIPOSupported)
24+
check_ipo_supported(RESULT _ipo_ok OUTPUT _ipo_err)
25+
if(_ipo_ok)
26+
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE ON)
27+
endif()
1828
endif()
1929

20-
# ─── Dependencies ────────────────────────────────────────────────────────────
21-
# libxml2 not required - using built-in XML parser
22-
find_package(ZLIB REQUIRED)
30+
# ─── Dependencies ─────────────────────────────────────────────────────────────
31+
# No libxml2 required — self-contained XML parser.
32+
#
33+
# zlib strategy:
34+
# MSVC — no system zlib exists. zip_reader.cpp compiles the vendored
35+
# single-header inflate from vendor/zlib/zlib.h; no link step needed.
36+
# MinGW — zlib1.dll + libz.a ship with every MinGW-w64 installation.
37+
# Linux / macOS — system zlib (apt install zlib1g-dev / brew install zlib).
38+
if(NOT MSVC)
39+
find_package(ZLIB REQUIRED)
40+
endif()
2341

24-
# ─── Core shared library ─────────────────────────────────────────────────────
42+
# ─── Core shared library ─────────────────────────────────────────────────────
2543
add_library(docx_comment_parser SHARED
2644
src/docx_parser.cpp
2745
src/batch_parser.cpp
2846
src/zip_reader.cpp
2947
src/xml_parser.cpp
3048
)
3149

50+
# Tell the compiler which TUs are *building* the DLL so DOCX_API expands to
51+
# dllexport; consumers that only include the header get dllimport instead.
52+
target_compile_definitions(docx_comment_parser PRIVATE DOCX_BUILDING_DLL)
53+
3254
target_include_directories(docx_comment_parser
3355
PUBLIC
3456
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
3557
$<INSTALL_INTERFACE:include>
3658
PRIVATE
59+
# On MSVC the vendored zlib.h is included by zip_reader.cpp via a
60+
# relative path "../vendor/zlib/zlib.h". Adding vendor/ here makes
61+
# the path work regardless of which directory cl.exe is invoked from.
62+
$<$<CXX_COMPILER_ID:MSVC>:${CMAKE_CURRENT_SOURCE_DIR}/vendor>
3763
)
3864

3965
target_link_libraries(docx_comment_parser
4066
PRIVATE
41-
ZLIB::ZLIB
67+
# Link system zlib on non-MSVC platforms only.
68+
# On MSVC, inflate is compiled directly into zip_reader.cpp.
69+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:ZLIB::ZLIB>
4270
)
4371

44-
# Hide all symbols except those explicitly exported with DOCX_API
45-
set_target_properties(docx_comment_parser PROPERTIES
46-
CXX_VISIBILITY_PRESET hidden
47-
VISIBILITY_INLINES_HIDDEN ON
48-
VERSION ${PROJECT_VERSION}
49-
SOVERSION 1
50-
)
72+
# On MinGW, link the standard threading and networking libs that std::thread
73+
# and socket code pull in indirectly.
74+
if(MINGW)
75+
target_link_libraries(docx_comment_parser PRIVATE -lws2_32 -lmswsock)
76+
endif()
77+
78+
# Symbol visibility — ELF only. On Windows (PE/DLL) visibility is handled by
79+
# __declspec(dllexport/dllimport) in the header; the GCC attribute is a no-op
80+
# there but can confuse some linker versions, so guard it explicitly.
81+
if(NOT WIN32)
82+
set_target_properties(docx_comment_parser PROPERTIES
83+
CXX_VISIBILITY_PRESET hidden
84+
VISIBILITY_INLINES_HIDDEN ON
85+
)
86+
endif()
87+
88+
# VERSION / SOVERSION are ELF-only concepts (produce .so.1 symlinks on Linux).
89+
# MinGW/Windows uses a different DLL versioning mechanism; setting these
90+
# properties on a PE target causes the ld "error: ld returned 5" link failure.
91+
if(NOT WIN32)
92+
set_target_properties(docx_comment_parser PROPERTIES
93+
VERSION ${PROJECT_VERSION}
94+
SOVERSION 1
95+
)
96+
endif()
5197

5298
target_compile_options(docx_comment_parser PRIVATE
5399
$<$<CXX_COMPILER_ID:GNU,Clang>:-Wall -Wextra -Wpedantic>
54100
$<$<CONFIG:Release>:-O3 -DNDEBUG>
55101
)
56102

57-
# ─── Python extension (optional) ─────────────────────────────────────────────
103+
# ─── Python extension (optional) ─────────────────────────────────────────────
58104
option(BUILD_PYTHON_BINDINGS "Build Python bindings via pybind11" ON)
59105

60106
if(BUILD_PYTHON_BINDINGS)
61107
find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
62108
find_package(pybind11 CONFIG QUIET)
63109

64110
if(NOT pybind11_FOUND)
65-
# Try to locate pybind11 via pip-installed package
111+
# Locate pybind11 installed via pip
66112
execute_process(
67113
COMMAND ${Python3_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())"
68114
OUTPUT_VARIABLE _pybind11_cmake_dir
@@ -79,48 +125,55 @@ if(BUILD_PYTHON_BINDINGS)
79125
python/python_bindings.cpp
80126
)
81127

128+
# The extension builds *into* the DLL — it also needs DOCX_BUILDING_DLL
129+
# so that DOCX_API expands to dllexport when compiling its TU on Windows.
130+
target_compile_definitions(_docx_comment_parser PRIVATE DOCX_BUILDING_DLL)
131+
82132
target_include_directories(_docx_comment_parser PRIVATE
83133
${CMAKE_CURRENT_SOURCE_DIR}/include
84-
134+
# Vendor dir for MSVC (same reason as the main library target)
135+
$<$<CXX_COMPILER_ID:MSVC>:${CMAKE_CURRENT_SOURCE_DIR}/vendor>
85136
)
86137

87138
target_link_libraries(_docx_comment_parser PRIVATE
88139
docx_comment_parser
89-
90-
ZLIB::ZLIB
140+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:ZLIB::ZLIB>
91141
)
92142

93-
# Install alongside the Python package
143+
# CMP0177 NEW: CMake normalises the path before passing it to install(),
144+
# so backslashes from Python3_SITEARCH on Windows are converted to slashes.
94145
install(TARGETS _docx_comment_parser
95-
LIBRARY DESTINATION ${Python3_SITEARCH}
146+
LIBRARY DESTINATION "${Python3_SITEARCH}"
147+
RUNTIME DESTINATION "${Python3_SITEARCH}"
96148
)
97149
else()
98-
message(WARNING "pybind11 not found – Python bindings will not be built. "
99-
"Install with: pip install pybind11")
150+
message(WARNING
151+
"pybind11 not found — Python bindings will not be built.\n"
152+
"Install with: pip install pybind11")
100153
endif()
101154
endif()
102155

103156
# ─── Install rules ────────────────────────────────────────────────────────────
104157
include(GNUInstallDirs)
105158

106159
install(TARGETS docx_comment_parser
107-
EXPORT docx_comment_parserTargets
108-
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
109-
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
110-
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
111-
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
160+
EXPORT docx_comment_parserTargets
161+
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
162+
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
163+
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
164+
INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
112165
)
113166

114167
install(FILES
115168
include/docx_comment_parser.h
116169
include/zip_reader.h
117-
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/docx_comment_parser
170+
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/docx_comment_parser"
118171
)
119172

120173
install(EXPORT docx_comment_parserTargets
121-
FILE docx_comment_parserTargets.cmake
122-
NAMESPACE docx::
123-
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/docx_comment_parser
174+
FILE docx_comment_parserTargets.cmake
175+
NAMESPACE docx::
176+
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/docx_comment_parser"
124177
)
125178

126179
# ─── Tests ────────────────────────────────────────────────────────────────────

include/docx_comment_parser.h

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,24 @@
11
#pragma once
22

3+
// ─── Symbol export/import macro ───────────────────────────────────────────────
4+
//
5+
// On Windows (MSVC and MinGW) DLL symbols must be explicitly exported by the
6+
// *building* translation unit and imported by *consuming* translation units.
7+
//
8+
// DOCX_BUILDING_DLL – defined by CMake / setup.py when compiling the library
9+
// itself; expands DOCX_API to __declspec(dllexport).
10+
// NOT defined when a downstream project includes this header,
11+
// so DOCX_API expands to __declspec(dllimport) instead.
12+
//
13+
// On Linux/macOS with -fvisibility=hidden the macro marks symbols as default
14+
// visibility so the linker exports them even though the TU default is hidden.
15+
316
#ifdef _WIN32
4-
#define DOCX_API __declspec(dllexport)
17+
#ifdef DOCX_BUILDING_DLL
18+
#define DOCX_API __declspec(dllexport)
19+
#else
20+
#define DOCX_API __declspec(dllimport)
21+
#endif
522
#else
623
#define DOCX_API __attribute__((visibility("default")))
724
#endif
@@ -43,12 +60,12 @@ struct DOCX_API CommentMetadata {
4360

4461
// --- content ---
4562
std::string text; // full plain-text of comment body
46-
std::string paragraph_style; // style of first paragraph inside comment
63+
std::string paragraph_style; // style of first paragraph inside comment
4764

4865
// --- anchoring ---
49-
std::string range_start_para_id; // w:commentRangeStart / paraId attribute (OOXML 2016+)
66+
std::string range_start_para_id; // w:commentRangeStart / paraId (OOXML 2016+)
5067
std::string range_end_para_id;
51-
std::string referenced_text; // the document text the comment is anchored to
68+
std::string referenced_text; // document text the comment is anchored to
5269

5370
// --- threading (OOXML 2016+) ---
5471
bool is_reply{false};
@@ -61,8 +78,8 @@ struct DOCX_API CommentMetadata {
6178
bool done{false}; // @w16cex:done
6279

6380
// --- document position ---
64-
int paragraph_index{-1}; // 0-based paragraph in document body
65-
int run_index{-1}; // 0-based run within that paragraph
81+
int paragraph_index{-1}; // 0-based paragraph in document body
82+
int run_index{-1}; // 0-based run within that paragraph
6683

6784
// Convenience: resolved full thread chain (only set on root comments)
6885
std::vector<int> thread_ids; // ordered list of ids in this thread
@@ -90,8 +107,8 @@ struct DOCX_API DocxParserError : std::runtime_error {
90107
explicit DocxParserError(const std::string& msg) : std::runtime_error(msg) {}
91108
};
92109

93-
struct DOCX_API DocxFileError : DocxParserError {
94-
explicit DocxFileError(const std::string& msg) : DocxParserError(msg) {}
110+
struct DOCX_API DocxFileError : DocxParserError {
111+
explicit DocxFileError(const std::string& msg) : DocxParserError(msg) {}
95112
};
96113

97114
struct DOCX_API DocxFormatError : DocxParserError {

setup.py

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,31 @@
77
88
Or build in-place:
99
python setup.py build_ext --inplace
10+
11+
Supported toolchains:
12+
Linux / macOS : GCC or Clang (uses -std=c++17, -lz)
13+
Windows MSVC : Visual Studio 2019+ (uses /std:c++17)
14+
Windows MinGW : MinGW-w64 via MSYS2 (uses -std=c++17, -lz)
1015
"""
1116

1217
from setuptools import setup, Extension
1318
import sys
1419
import os
20+
import subprocess
21+
22+
23+
def _is_mingw() -> bool:
24+
"""Return True when building with MinGW-w64 GCC under Windows."""
25+
if sys.platform != "win32":
26+
return False
27+
try:
28+
out = subprocess.check_output(
29+
["gcc", "--version"], stderr=subprocess.DEVNULL
30+
).decode()
31+
return "mingw" in out.lower() or "MINGW" in out
32+
except (FileNotFoundError, subprocess.CalledProcessError):
33+
return False
34+
1535

1636
# ─── Locate pybind11 headers ──────────────────────────────────────────────────
1737
try:
@@ -20,16 +40,43 @@
2040
except ImportError:
2141
raise RuntimeError("pybind11 not found. Install with: pip install pybind11")
2242

23-
# ─── Platform-specific flags ─────────────────────────────────────────────────
24-
if sys.platform == "win32":
25-
extra_compile_args = ["/std:c++17", "/O2", "/DNDEBUG", "/EHsc"]
26-
extra_link_args = []
43+
# ─── Platform / toolchain flags ──────────────────────────────────────────────
44+
IS_MINGW = _is_mingw()
45+
IS_MSVC = sys.platform == "win32" and not IS_MINGW
46+
47+
if IS_MSVC:
48+
# MSVC (cl.exe) — Visual Studio 2019 or later
49+
extra_compile_args = [
50+
"/std:c++17",
51+
"/O2",
52+
"/DNDEBUG",
53+
"/EHsc", # enable C++ exception handling
54+
"/DDOCX_BUILDING_DLL", # expand DOCX_API to __declspec(dllexport)
55+
]
56+
extra_link_args = [] # zlib linked via vcpkg / find_package
57+
58+
elif IS_MINGW:
59+
# MinGW-w64 GCC on Windows (MSYS2 / standalone)
60+
extra_compile_args = [
61+
"-std=c++17",
62+
"-O2",
63+
"-DNDEBUG",
64+
"-DDOCX_BUILDING_DLL", # expand DOCX_API to __declspec(dllexport)
65+
"-Wall",
66+
]
67+
extra_link_args = [
68+
"-lz", # zlib1.dll — ships with every MinGW installation
69+
"-lws2_32", # Winsock — required by std::thread on MinGW
70+
"-lmswsock",
71+
]
72+
2773
else:
74+
# Linux / macOS — GCC or Clang
2875
extra_compile_args = [
2976
"-std=c++17",
3077
"-O3",
3178
"-DNDEBUG",
32-
"-fvisibility=hidden",
79+
"-fvisibility=hidden", # hide all symbols except those marked DOCX_API
3380
]
3481
extra_link_args = ["-lz"]
3582

@@ -45,7 +92,7 @@
4592
ext = Extension(
4693
"docx_comment_parser",
4794
sources=sources,
48-
include_dirs=["include", pybind11_include],
95+
include_dirs=["include", "vendor", pybind11_include],
4996
extra_compile_args=extra_compile_args,
5097
extra_link_args=extra_link_args,
5198
language="c++",

src/xml_parser.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,10 @@ bool sax_parse(const char* data, std::size_t len, SaxHandler& h) {
148148

149149
// Processing instruction or XML declaration
150150
if (*p == '?') {
151-
while (p < end && *p != '>') ++p;
152-
if (p < end) ++p;
151+
++p;
152+
// Scan for the closing "?>" sequence, not just ">"
153+
while (p + 1 < end && !(p[0] == '?' && p[1] == '>')) ++p;
154+
if (p + 1 < end) p += 2; // skip "?>"
153155
continue;
154156
}
155157

0 commit comments

Comments
 (0)