Skip to content

Commit a2ca875

Browse files
author
sebastien.bouvard
committed
QPR-13312 Schema check python script for ORE
1 parent 324d5bb commit a2ca875

1 file changed

Lines changed: 210 additions & 0 deletions

File tree

xsd/check.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
import logging
2+
import os
3+
import argparse
4+
from lxml import etree
5+
from lxml.etree import XMLSchemaParseError
6+
from lxml.etree import XMLSyntaxError
7+
import sys
8+
from typing import Iterable, List, Optional
9+
10+
# Global result flag. NOTE: despite its name, True means NO errors were encountered; it is set to False on any parse or schema error.
11+
ERROR: bool = True
12+
13+
14+
def remove_temp_files(dir_name: str) -> None:
    """
    Remove ``check.log`` from ``dir_name`` when the run recorded no errors.

    The module-level ``ERROR`` flag is True while no error has been recorded;
    in that case the log carries no findings and is deleted. When ``ERROR`` is
    False the log is kept so the failures can be inspected.

    Any ``logging.FileHandler`` attached to the root logger that writes to this
    ``check.log`` is detached and closed first: deleting a file that is still
    open fails on Windows and leaves a dangling handle elsewhere.

    :param dir_name: Directory that contains ``check.log``.
    :type dir_name: str
    """
    if not ERROR:
        return

    check_log_path = os.path.join(dir_name, 'check.log')
    target = os.path.normcase(os.path.abspath(check_log_path))

    # Release the log file before deleting it.
    root_logger = logging.getLogger()
    for handler in list(root_logger.handlers):
        if not isinstance(handler, logging.FileHandler):
            continue
        if os.path.normcase(os.path.abspath(handler.baseFilename)) == target:
            root_logger.removeHandler(handler)
            handler.close()

    # EAFP: a missing log file is not an error.
    try:
        os.remove(check_log_path)
    except FileNotFoundError:
        pass
25+
26+
def iter_xml_files(root_dir: str) -> Iterable[str]:
    """
    Yield the path of every ``.xml`` file (case-insensitive suffix) below ``root_dir``.

    Directories and file names are visited in sorted order so that the
    sequence of validated files, and therefore the log, is reproducible
    across runs and filesystems. A directory that cannot be listed is
    reported with a warning instead of being skipped silently.

    :param root_dir: Directory to walk recursively.
    :type root_dir: str
    :returns: Generator of file paths built from ``root_dir``.
    """
    def _report_walk_error(err: OSError) -> None:
        # os.walk ignores listing errors unless an onerror callback is given.
        logging.warning(f"Cannot list directory {err.filename}: {err}")

    for dirpath, dirnames, filenames in os.walk(root_dir, onerror=_report_walk_error):
        # Sorting in place steers the order in which os.walk descends.
        dirnames.sort()
        for fname in sorted(filenames):
            if fname.lower().endswith(".xml"):
                yield os.path.join(dirpath, fname)
31+
32+
33+
def xml_validator(xml_path: str, schema: etree.XMLSchema) -> None:
    """
    Check one XML file against an already compiled schema.

    Files whose path contains ``invalid_calendaradjustments`` are skipped
    (logged at INFO level). Any parse error, schema violation or other
    exception is logged as an error and recorded by setting the global
    ``ERROR`` flag to False; the exception itself is not propagated.

    :param xml_path: Filesystem path to the XML file.
    :type xml_path: str
    :param schema: Precompiled lxml XMLSchema object.
    :type schema: etree.XMLSchema
    """
    global ERROR

    if "invalid_calendaradjustments" in str(xml_path):
        logging.info(f"Skipping validation for {xml_path} (name contains 'invalid_calendaradjustments')")
        return

    # Collect the failure text first, then record and log it in one place.
    failure = None
    try:
        with open(xml_path, 'rb') as stream:
            # Plain parse, then validate against the shared compiled schema.
            schema.assertValid(etree.parse(stream))
    except XMLSyntaxError as e:
        failure = f"XMLSyntaxError in {xml_path}: {str(e)}"
    except etree.DocumentInvalid as e:
        failure = f"Schema validation failed for {xml_path}: {str(e)}"
    except Exception as e:
        failure = f"Unexpected error validating {xml_path}: {e}"

    if failure is not None:
        ERROR = False
        logging.error(failure)
63+
64+
65+
def _compile_schema_once(dir_name: str) -> Optional[etree.XMLSchema]:
    """
    Parse ``input.xsd`` located in ``dir_name`` and compile it into an XMLSchema.

    On any failure (file missing, XSD not well-formed, schema not compilable,
    or any other exception) the problem is logged as an error, the global
    ``ERROR`` flag is set to False and None is returned.

    :param dir_name: Directory expected to contain ``input.xsd``.
    :type dir_name: str
    :returns: The compiled schema, or None when it could not be built.
    """
    global ERROR
    xsd_path = os.path.join(dir_name, 'input.xsd')

    compiled = None
    problem = None
    try:
        if os.path.isfile(xsd_path):
            # Parse from disk (the path is a file name, not XML content), then compile.
            compiled = etree.XMLSchema(etree.parse(xsd_path))
        else:
            problem = f"Schema file not found at: {xsd_path}"
    except XMLSchemaParseError as e:
        problem = f"XMLSchemaParseError while compiling schema from '{xsd_path}': {e}"
    except XMLSyntaxError as e:
        problem = f"XMLSyntaxError while parsing schema file '{xsd_path}': {e}"
    except Exception as e:
        problem = f"Unexpected error compiling schema from '{xsd_path}': {e}"

    if problem is None:
        return compiled

    ERROR = False
    logging.error(problem)
    return None
91+
92+
93+
def _collect_xmls(paths: Iterable[str]) -> List[str]:
    """
    Gather the XML files found under each directory in ``paths``.

    Entries that are not existing directories contribute no files. They are
    reported with a warning so that a missing or mistyped scan root does not
    go unnoticed (previously they were dropped silently).

    :param paths: Directories to scan recursively.
    :type paths: Iterable[str]
    :returns: XML file paths, in the order the directories were given.
    :rtype: List[str]
    """
    files: List[str] = []
    for p in paths:
        if not os.path.isdir(p):
            logging.warning(f"Skipping scan root (not a directory): {p}")
            continue
        files.extend(iter_xml_files(p))
    return files
99+
100+
101+
def _setup_logging(log_level: str, dir_name: str) -> None:
    """
    Configure the root logger: ERROR and above to stderr, details to ``check.log``.

    Handlers already attached to the root logger are removed and closed so
    that repeated calls neither duplicate output nor leak open log files.

    :param log_level: Name of the level for the file handler (case-insensitive,
        e.g. ``"info"``). Anything that is not a numeric level constant of the
        ``logging`` module falls back to DEBUG.
    :type log_level: str
    :param dir_name: Directory in which ``check.log`` is written.
    :type dir_name: str
    """
    # getattr may hit a non-level attribute (e.g. BASIC_FORMAT); accept ints only.
    file_level = getattr(logging, log_level.upper(), None)
    if not isinstance(file_level, int):
        file_level = logging.DEBUG

    logger = logging.getLogger()
    # The root logger passes everything; each handler applies its own threshold.
    logger.setLevel(logging.DEBUG)

    # Drop existing handlers to avoid duplicate logs, and close them to release files.
    for h in list(logger.handlers):
        logger.removeHandler(h)
        h.close()

    formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')

    # Console handler (ERROR only)
    ch = logging.StreamHandler(stream=sys.stderr)
    ch.setLevel(logging.ERROR)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # File handler at the configured level. UTF-8 is set explicitly so paths
    # and messages with non-ASCII characters do not depend on the locale.
    check_log_path = os.path.join(dir_name, 'check.log')
    fh = logging.FileHandler(check_log_path, encoding='utf-8')
    fh.setLevel(file_level)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
127+
128+
129+
def main(xml_path:str, jobs:int=1) -> None:
    """
    Run the XML schema check.

    The schema is compiled once from ``input.xsd`` next to this script. Then
    either the single file given by ``xml_path`` is validated, or every XML
    file found under the ``Examples``, ``RegressionTests`` and
    ``OREData/test`` directories (relative to the parent of the script
    directory) is validated with ``xml_validator``.

    Failures are recorded in the global ``ERROR`` flag (False means at least
    one failure occurred).

    :param xml_path: Path of a single XML file to validate; an empty string
        selects the full directory scan.
    :type xml_path: str
    :param jobs: Number of worker threads; values of 1 or less validate
        sequentially.
    :type jobs: int
    :returns: None
    :raises SystemExit: With status 1 when the schema cannot be compiled. In
        single-file mode always raised: status 0 when the file is valid,
        1 when it is invalid or does not exist.
    """
    # Declared up front: the flag is both read and written below.
    global ERROR

    logging.info("Start Schema Check")
    dir_name = os.path.abspath(os.path.dirname(__file__))

    dict_paths = {
        "Examples": os.path.join(dir_name, '..', 'Examples'),
        "RegressionTest": os.path.join(dir_name, '..', 'RegressionTests'),
        "OREData": os.path.join(dir_name, '..', 'OREData', 'test')
    }

    # Compile the schema once; it is shared by all validations
    schema = _compile_schema_once(dir_name)
    if schema is None:
        logging.error("Schema compilation failed; aborting.")
        remove_temp_files(dir_name)
        sys.exit(1)

    # Single-file mode
    if xml_path:
        if os.path.isfile(xml_path):
            logging.info(f"Validate {xml_path}")
            xml_validator(xml_path, schema)
        else:
            # A wrong path must not silently turn into a full directory scan.
            ERROR = False
            logging.error(f"XML file not found: {xml_path}")
        remove_temp_files(dir_name)
        # Report the validation outcome instead of unconditionally exiting with 0.
        sys.exit(0 if ERROR else 1)

    # Collect all XML files to validate
    paths_to_scan = list(dict_paths.values())
    xml_files = _collect_xmls(paths_to_scan)
    logging.info(f"Found {len(xml_files)} XML files to validate")
    if not xml_files:
        logging.warning("No XML files found in the scanned directories; nothing was validated.")

    # Optional parallel validation using threads; threads avoid having to
    # pickle the compiled schema for worker processes.
    if jobs and jobs > 1:
        try:
            from concurrent.futures import ThreadPoolExecutor, as_completed
            with ThreadPoolExecutor(max_workers=jobs) as executor:
                futures = {executor.submit(xml_validator, xf, schema): xf for xf in xml_files}
                for future in as_completed(futures):
                    # Drain every future so an escaped exception gets logged
                    try:
                        future.result()
                    except Exception as e:
                        # Any exception surfaced here means at least one validation failed
                        ERROR = False
                        logging.error(f"Validation task failed: {e}")
        except Exception as e:
            logging.warning(f"Parallel validation setup failed ({e}); falling back to sequential.")
            for xf in xml_files:
                xml_validator(xf, schema)
    else:
        for xf in xml_files:
            xml_validator(xf, schema)

    # check.log is removed only when no error was recorded
    remove_temp_files(dir_name)
193+
194+
195+
if __name__ == '__main__':
    # Command-line entry point: parse options, configure logging, run the check.
    cli = argparse.ArgumentParser(description='Check if xml file is valid')
    # Level for the check.log file handler; the default is INFO
    cli.add_argument('--log_level', type=str, default="INFO")
    cli.add_argument('--xml_path', type=str, default="", help="Direct XML path")
    cli.add_argument('--jobs', type=int, default=2, help="Number of parallel validation workers (threads)")
    options = cli.parse_args()

    # Logging must be in place before main() emits its first message
    # (console shows ERROR only, check.log receives the details).
    here = os.path.dirname(__file__)
    _setup_logging(options.log_level, os.path.abspath(here))

    main(xml_path=options.xml_path, jobs=options.jobs)

    # ERROR is False once any failure was recorded; signal that to the caller
    if not ERROR:
        sys.exit(-1)

0 commit comments

Comments
 (0)