Skip to content

Commit a2c33d5

Browse files
nicknick
authored andcommitted
Initial Commit#1
1 parent bceb512 commit a2c33d5

12 files changed

Lines changed: 2496 additions & 0 deletions

CMakeLists.txt

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
cmake_minimum_required(VERSION 3.15)
project(docx_comment_parser VERSION 1.0.0 LANGUAGES CXX)

# ─── Standard & optimisation flags ──────────────────────────────────────────
# Require ISO C++17 exactly: no fallback to an older standard and no
# compiler-specific extensions (-std=c++17 rather than -std=gnu++17).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Default to a Release build when the user did not choose a build type.
# Multi-config generators (Visual Studio, Xcode, Ninja Multi-Config) select
# the configuration at build time and do not use CMAKE_BUILD_TYPE, so the
# default is only applied for single-config generators.
get_property(_docx_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _docx_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()
12+
13+
# LTO for Release builds
# Interprocedural optimisation is switched on for the Release configuration
# only when CMake's toolchain probe succeeds. When it fails, the reason is
# reported so the missing optimisation does not go unnoticed.
include(CheckIPOSupported)
check_ipo_supported(RESULT _ipo_ok OUTPUT _ipo_err LANGUAGES CXX)
if(_ipo_ok)
  set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE ON)
else()
  message(STATUS "IPO/LTO not enabled: ${_ipo_err}")
endif()
19+
20+
# ─── Dependencies ────────────────────────────────────────────────────────────
21+
# libxml2 not required - using built-in XML parser
22+
# zlib is the only external package this script looks up; it is consumed
# through the imported target ZLIB::ZLIB.
find_package(ZLIB REQUIRED)

# ─── Core shared library ─────────────────────────────────────────────────────
add_library(docx_comment_parser SHARED
  src/docx_parser.cpp
  src/batch_parser.cpp
  src/zip_reader.cpp
  src/xml_parser.cpp
)

# Namespaced alias using the same docx:: prefix as the installed export, so
# consumers can link docx::docx_comment_parser both in-tree and from an
# installed package.
add_library(docx::docx_comment_parser ALIAS docx_comment_parser)

# Public headers live in include/ in the source tree and under the install
# prefix's include/ directory once installed.
target_include_directories(docx_comment_parser
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>
)

# zlib is linked PRIVATE: it is not propagated to consumers of the library.
target_link_libraries(docx_comment_parser
  PRIVATE
    ZLIB::ZLIB
)
45+
46+
# Hide all symbols except those explicitly exported with DOCX_API
# VERSION and SOVERSION both follow the project() version, so bumping the
# major version there also changes the soname.
set_target_properties(docx_comment_parser PROPERTIES
  CXX_VISIBILITY_PRESET hidden
  VISIBILITY_INLINES_HIDDEN ON
  VERSION ${PROJECT_VERSION}
  SOVERSION ${PROJECT_VERSION_MAJOR}
)
53+
54+
# Warning and optimisation flags for the library itself (PRIVATE: they are
# not propagated to consumers).
# Each generator expression is quoted and uses ';' between flags so that it
# reaches CMake as a single argument instead of being split on spaces.
target_compile_options(docx_comment_parser PRIVATE
  # GCC-style warning flags; AppleClang is a separate compiler ID from Clang.
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wpedantic>"
  # -O3 is not an MSVC option (cl.exe ignores it with a warning), so it is
  # passed to every compiler except MSVC.
  "$<$<AND:$<CONFIG:Release>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-O3>"
)

# NDEBUG is expressed as a definition so CMake emits it in the right form
# (-DNDEBUG or /DNDEBUG) for the active compiler.
target_compile_definitions(docx_comment_parser PRIVATE
  $<$<CONFIG:Release>:NDEBUG>
)
58+
59+
# ─── Python extension (optional) ─────────────────────────────────────────────
60+
# BUILD_PYTHON_BINDINGS (default ON): build the pybind11 extension module
# _docx_comment_parser. Python3 itself is required when the option is ON;
# a missing pybind11 only produces a warning and the module is skipped.
option(BUILD_PYTHON_BINDINGS "Build Python bindings via pybind11" ON)

if(BUILD_PYTHON_BINDINGS)
  find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
  find_package(pybind11 CONFIG QUIET)

  if(NOT pybind11_FOUND)
    # Try to locate pybind11 via pip-installed package
    execute_process(
      COMMAND "${Python3_EXECUTABLE}" -c "import pybind11; print(pybind11.get_cmake_dir())"
      RESULT_VARIABLE _pybind11_probe_rc
      OUTPUT_VARIABLE _pybind11_cmake_dir
      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # Only trust the output when the interpreter exited successfully.
    if(_pybind11_probe_rc EQUAL 0 AND _pybind11_cmake_dir)
      # Normalise to forward slashes so a Windows path is safe to store in
      # CMAKE_PREFIX_PATH.
      file(TO_CMAKE_PATH "${_pybind11_cmake_dir}" _pybind11_cmake_dir)
      list(APPEND CMAKE_PREFIX_PATH "${_pybind11_cmake_dir}")
      # QUIET rather than REQUIRED: a failed lookup falls through to the
      # warning below instead of aborting the whole configure step.
      find_package(pybind11 CONFIG QUIET)
    endif()
  endif()

  if(pybind11_FOUND)
    pybind11_add_module(_docx_comment_parser
      python/python_bindings.cpp
    )

    target_include_directories(_docx_comment_parser PRIVATE
      "${CMAKE_CURRENT_SOURCE_DIR}/include"
    )

    target_link_libraries(_docx_comment_parser PRIVATE
      docx_comment_parser
      ZLIB::ZLIB
    )

    # Install alongside the Python package
    # Quoted so a site-packages path containing spaces stays one argument.
    install(TARGETS _docx_comment_parser
      LIBRARY DESTINATION "${Python3_SITEARCH}"
    )
  else()
    message(WARNING "pybind11 not found – Python bindings will not be built. "
                    "Install with: pip install pybind11")
  endif()
endif()
104+
105+
# ─── Install rules ────────────────────────────────────────────────────────────
106+
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# Install the shared library and record it in the export set.
# Two include directories are exported: the include root, and the
# docx_comment_parser/ subdirectory the headers are installed into, so
# `#include "docx_comment_parser.h"` resolves against an installed package
# just as it does in the build tree.
install(TARGETS docx_comment_parser
  EXPORT docx_comment_parserTargets
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
  INCLUDES DESTINATION
    "${CMAKE_INSTALL_INCLUDEDIR}"
    "${CMAKE_INSTALL_INCLUDEDIR}/docx_comment_parser"
)

# Public headers.
install(FILES
  include/docx_comment_parser.h
  include/zip_reader.h
  include/xml_utils.h
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/docx_comment_parser"
)

# Imported-target definitions (docx::docx_comment_parser).
install(EXPORT docx_comment_parserTargets
  FILE docx_comment_parserTargets.cmake
  NAMESPACE docx::
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/docx_comment_parser"
)

# Package config + version files so that find_package(docx_comment_parser)
# can locate the export. The config file only includes the targets file;
# zlib is a PRIVATE dependency of a shared library, so consumers do not need
# to find it themselves.
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/docx_comment_parserConfig.cmake"
  "include(\"\${CMAKE_CURRENT_LIST_DIR}/docx_comment_parserTargets.cmake\")\n"
)
write_basic_package_version_file(
  "${CMAKE_CURRENT_BINARY_DIR}/docx_comment_parserConfigVersion.cmake"
  VERSION "${PROJECT_VERSION}"
  COMPATIBILITY SameMajorVersion
)
install(FILES
  "${CMAKE_CURRENT_BINARY_DIR}/docx_comment_parserConfig.cmake"
  "${CMAKE_CURRENT_BINARY_DIR}/docx_comment_parserConfigVersion.cmake"
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/docx_comment_parser"
)
128+
129+
# ─── Tests ────────────────────────────────────────────────────────────────────
130+
# BUILD_TESTS (default ON): configure the test suite found in tests/.
option(BUILD_TESTS "Build test suite" ON)

# enable_testing() is called here, before the subdirectory is added, so the
# tests/ directory is processed with testing already turned on.
if(BUILD_TESTS)
  enable_testing()
  add_subdirectory(tests)
endif()

README.md

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# docx_comment_parser
2+
3+
A fast, memory-efficient C++17 shared library (DLL/SO) that extracts **all comment metadata** from `.docx` files, with full Python bindings via pybind11.
4+
5+
## Features
6+
7+
| Feature | Details |
8+
|---|---|
9+
| Comment fields | id, author, date, initials, full text, paragraph style |
10+
| Anchoring | referenced document text (via `commentRangeStart/End`) |
11+
| Threading | parent/reply relationships (OOXML 2016+ `commentsExtended.xml`) |
12+
| Resolution | `done` flag, earliest/latest dates, per-author filtering |
13+
| Batch parsing | Thread-pool with configurable parallelism |
14+
| Memory | ZIP entries inflated one-at-a-time; SAX for document body; no full DOM |
15+
| Dependencies | zlib only; XML is handled by the built-in parser (libxml2 is not required) |
16+
| Python | pybind11 extension module, GIL released during batch parsing |
17+
18+
---
19+
20+
## Building
21+
22+
### Prerequisites
23+
24+
**Linux / macOS**
25+
```bash
26+
sudo apt install zlib1g-dev # Debian/Ubuntu
27+
brew install zlib # macOS
28+
pip install pybind11 cmake
29+
```
30+
31+
**Windows**
32+
Install [vcpkg](https://github.com/microsoft/vcpkg) then:
33+
```powershell
34+
vcpkg install zlib pybind11
35+
```
36+
37+
### CMake (recommended)
38+
39+
```bash
40+
cmake -B build -DCMAKE_BUILD_TYPE=Release
41+
cmake --build build -j$(nproc)
42+
# Optionally run tests:
43+
cd build && ctest --output-on-failure
44+
```
45+
46+
This produces:
47+
- `build/libdocx_comment_parser.so` (Linux) / `.dylib` (macOS) / `.dll` (Windows)
48+
- `build/_docx_comment_parser*.so` – Python extension
49+
50+
### pip (Python only)
51+
52+
```bash
53+
pip install pybind11
54+
pip install .
55+
```
56+
57+
---
58+
59+
## Python Usage
60+
61+
```python
62+
import docx_comment_parser as dcp
63+
64+
# ── Single file ──────────────────────────────────────────────────────────────
65+
parser = dcp.DocxParser()
66+
parser.parse("report.docx")
67+
68+
for c in parser.comments():
69+
print(f"[{c.id}] {c.author} ({c.date}): {c.text[:80]}")
70+
if c.referenced_text:
71+
print(f" ↳ anchored to: '{c.referenced_text[:60]}'")
72+
if c.is_reply:
73+
print(f" ↳ reply to comment #{c.parent_id}")
74+
75+
# Filter by author
76+
for c in parser.by_author("Alice"):
77+
print(c.to_dict())
78+
79+
# Get full thread for a root comment
80+
for c in parser.thread(0):
81+
indent = " " if c.is_reply else ""
82+
print(f"{indent}[{c.id}] {c.author}: {c.text}")
83+
84+
# Stats
85+
s = parser.stats()
86+
print(f"Total: {s.total_comments}, Authors: {s.unique_authors}")
87+
print(f"Date range: {s.earliest_date} – {s.latest_date}")
88+
89+
# ── Batch (parallel) ─────────────────────────────────────────────────────────
90+
import glob
91+
92+
bp = dcp.BatchParser(max_threads=0) # 0 = auto
93+
files = glob.glob("/documents/**/*.docx", recursive=True)
94+
bp.parse_all(files)
95+
96+
for f in files:
97+
if f in bp.errors():
98+
print(f"ERROR {f}: {bp.errors()[f]}")
99+
continue
100+
s = bp.stats(f)
101+
print(f"{f}: {s.total_comments} comments by {len(s.unique_authors)} authors")
102+
103+
bp.release_all() # free memory
104+
```
105+
106+
---
107+
108+
## C++ Usage
109+
110+
```cpp
111+
#include "docx_comment_parser.h"
112+
113+
// Single file
114+
docx::DocxParser parser;
115+
parser.parse("report.docx");
116+
117+
for (const auto& c : parser.comments()) {
118+
std::cout << c.id << " | " << c.author << " | " << c.text << "\n";
119+
}
120+
121+
// Batch
122+
docx::BatchParser bp(/*threads=*/4);
123+
bp.parse_all({"a.docx", "b.docx", "c.docx"});
124+
for (const auto& [path, err] : bp.errors())
125+
std::cerr << "Failed: " << path << ": " << err << "\n";
126+
bp.release_all();
127+
```
128+
129+
---
130+
131+
## CommentMetadata fields
132+
133+
| Field | Type | Source |
134+
|---|---|---|
135+
| `id` | `int` | `w:id` attribute |
136+
| `author` | `str` | `w:author` |
137+
| `date` | `str` | `w:date` (ISO-8601) |
138+
| `initials` | `str` | `w:initials` |
139+
| `text` | `str` | Full plain-text of comment body |
140+
| `paragraph_style` | `str` | Style of first paragraph in comment |
141+
| `referenced_text` | `str` | Document text anchored by this comment |
142+
| `is_reply` | `bool` | True if this is a threaded reply |
143+
| `parent_id` | `int` | id of parent comment (-1 if root) |
144+
| `replies` | `list[CommentRef]` | Direct replies (populated on parent) |
145+
| `para_id` | `str` | OOXML 2016+ paragraph ID |
146+
| `para_id_parent` | `str` | Parent paragraph ID (before id resolution) |
147+
| `done` | `bool` | Resolved/done flag (`commentsExtended.xml`) |
148+
| `thread_ids` | `list[int]` | Ordered ids in this thread (root only) |
149+
| `paragraph_index` | `int` | 0-based paragraph in document body |
150+
| `run_index` | `int` | 0-based run within paragraph |
151+
152+
---
153+
154+
## Architecture
155+
156+
```
157+
docx_comment_parser/
158+
├── include/
159+
│ ├── docx_comment_parser.h # Public API (CommentMetadata, DocxParser, BatchParser)
160+
│ ├── zip_reader.h # ZIP reader interface (zlib only, no libzip)
161+
│ └── xml_utils.h # Lightweight libxml2 helpers
162+
├── src/
163+
│ ├── zip_reader.cpp # Memory-mapped ZIP + inflate
164+
│ ├── docx_parser.cpp # Core: comments.xml (DOM) + document.xml (SAX)
165+
│ └── batch_parser.cpp # Thread-pool batch processing
166+
├── python/
167+
│ └── python_bindings.cpp # pybind11 module
168+
├── tests/
169+
│ └── test_docx_parser.cpp # Self-contained test suite
170+
├── CMakeLists.txt
171+
└── setup.py
172+
```
173+
174+
### Memory model
175+
176+
- **ZIP entries** are memory-mapped and inflated one at a time; no entry's data is kept in memory while another is being read.
177+
- **`comments.xml`** is parsed into a DOM by the built-in XML parser (typically < 100 KB).
178+
- **`document.xml`** (which can be very large) is streamed SAX-style by the built-in XML parser; only the anchor text accumulator is kept in memory.
179+
- **BatchParser** runs one `DocxParser` per thread; results can be individually `release()`d to reclaim memory after use.
180+
181+
---
182+
183+
## License
184+
185+
MIT

0 commit comments

Comments
 (0)