Skip to content

Commit 2e8f2dc

Browse files
committed
add script scripts/remove_gremlins.py
1 parent 44e8cd4 commit 2e8f2dc

File tree

1 file changed

+224
-0
lines changed

1 file changed

+224
-0
lines changed

scripts/remove_gremlins.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import re
4+
import shutil
5+
import sys
6+
import unicodedata
7+
from pathlib import Path
8+
from tempfile import NamedTemporaryFile
9+
10+
# ----- Patterns & Tables -----
11+
12+
# Zero-width and bidi control junk to remove
13+
# Matches one invisible formatting character that should be deleted outright.
# Besides the embedding/override controls, the bidi *isolate* controls
# (U+2066-U+2069) are included so that no directional-formatting character
# survives the cleanup.
ZERO_WIDTH_BIDI_RE = re.compile(
    "["  # open char class
    "\u200B-\u200D"  # zero-width space / non-joiner / joiner
    "\u2060"  # word joiner
    "\u200E\u200F"  # LTR/RTL marks
    "\u202A-\u202E"  # bidi embedding/override (LRE, RLE, PDF, LRO, RLO)
    "\u2066-\u2069"  # bidi isolates (LRI, RLI, FSI, PDI)
    "\u061C"  # Arabic Letter Mark
    "\uFEFF"  # BOM / zero-width no-break space
    "]"
)
23+
24+
# NBSP-like spaces to turn into a normal ASCII space
25+
# Matches a single space-like character that gets replaced by a plain ASCII space.
NBSP_RE = re.compile(
    "["
    "\u00A0"  # NBSP
    "\u202F"  # narrow NBSP
    "\u180E"  # deprecated Mongolian vowel separator
    "]"
)
26+
27+
# Common typographic → ASCII replacements (leave French « » intact)
28+
# Typographic character -> ASCII replacement text.
# Declared grouped by replacement and flattened into one lookup dict.
# The French guillemets (« ») are deliberately absent, so they are left untouched.
TYPO_MAP = {
    fancy: plain
    for plain, fancy_chars in (
        (
            "-",
            (
                "\u2010",  # hyphen
                "\u2011",  # non-breaking hyphen
                "\u2012",  # figure dash
                "\u2013",  # en dash
                "\u2014",  # em dash
                "\u2212",  # minus sign
            ),
        ),
        (
            "'",
            (
                "\u2018",  # left single quotation mark
                "\u2019",  # right single quotation mark / apostrophe
                "\u201B",  # single high-reversed-9 quotation mark
                "\u2032",  # prime (often used as apostrophe)
            ),
        ),
        (
            '"',
            (
                "\u201C",  # left double quotation mark
                "\u201D",  # right double quotation mark
                "\u201F",  # double high-reversed-9 quotation mark
                "\u2033",  # double prime
            ),
        ),
        (
            "...",
            (
                "\u2026",  # ellipsis
            ),
        ),
    )
    for fancy in fancy_chars
}

# Code-point-keyed table for str.translate(); values may be longer than one
# character (the ellipsis expands to three dots).
TRANSLATE_TABLE = {ord(fancy): plain for fancy, plain in TYPO_MAP.items()}
51+
52+
# Heuristic markers that commonly show up in UTF-8→Latin-1 mojibake
53+
# Substrings whose presence hints that UTF-8 bytes were decoded with a
# single-byte codec at some point. Heuristic only: some of these ("â", "€",
# "œ") are also perfectly legitimate characters.
MOJIBAKE_MARKERS = (
    "Ã",
    "Â",
    "â",
    "€",
    "œ",
    "�",
)
54+
55+
56+
# ----- Helpers -----
57+
58+
def is_binary(path: Path, probe_size: int = 4096) -> bool:
    """Return True when *path* should be skipped as a binary file.

    Detection is deliberately conservative: a file counts as binary only if a
    NUL byte occurs within its first ``probe_size`` bytes. No decoding is
    attempted, so text in a non-UTF-8 encoding is not misreported as binary.

    Args:
        path: File to probe.
        probe_size: Number of leading bytes to inspect.

    Returns:
        True if a NUL byte was found, or if the file could not be opened or
        read (unreadable files are skipped rather than processed); False
        otherwise.
    """
    try:
        with open(path, "rb") as f:
            head = f.read(probe_size)
    except OSError:
        # Only I/O failures mean "skip this file"; programming errors
        # (e.g. a wrong argument type) are allowed to surface.
        return True
    return b"\x00" in head
71+
72+
73+
def looks_like_mojibake(text: str) -> bool:
    """Report whether *text* contains at least one MOJIBAKE_MARKERS substring."""
    for marker in MOJIBAKE_MARKERS:
        if marker in text:
            return True
    return False
75+
76+
77+
def mojibake_score(text: str) -> int:
    """Count the total occurrences of every MOJIBAKE_MARKERS substring in *text*."""
    total = 0
    for marker in MOJIBAKE_MARKERS:
        total += text.count(marker)
    return total
79+
80+
81+
def _mojibake_to_bytes(text: str) -> bytes:
    """Map mis-decoded text back to the single bytes it was decoded from.

    Code points below U+0100 map to the byte of the same value (Latin-1).
    Anything else must be one of the characters Windows-1252 places in the
    0x80-0x9F range (such as the euro sign, "œ", "™" or curly quotes).

    Raises:
        UnicodeEncodeError: if a character has no single-byte equivalent,
            meaning the text cannot be pure single-byte mojibake.
    """
    try:
        # Fast path: pure Latin-1 mojibake encodes in one C-level pass.
        return text.encode("latin-1")
    except UnicodeEncodeError:
        pass

    raw = bytearray()
    for ch in text:
        code = ord(ch)
        if code < 0x100:
            raw.append(code)
        else:
            raw += ch.encode("cp1252")  # raises for characters outside cp1252
    return bytes(raw)


def try_fix_mojibake(text: str) -> str:
    """Attempt to repair UTF-8 text that was decoded with a single-byte codec.

    Handles the case where UTF-8 bytes were decoded as Latin-1 or
    Windows-1252 BEFORE being written to the file (so the file now contains
    sequences such as "Ã©" or "â€™"). The text is mapped back to bytes and
    re-decoded as strict UTF-8.

    The repair is all-or-nothing for the whole string, and is kept only if it
    lowers the mojibake marker count; otherwise the input is returned
    unchanged.

    Args:
        text: Text to inspect.

    Returns:
        The repaired text, or *text* itself when no marker is present, the
        round trip is not possible, or the result is not an improvement.
    """
    if not looks_like_mojibake(text):
        return text

    try:
        repaired = _mojibake_to_bytes(text).decode("utf-8")
    except UnicodeError:
        # Not representable as single bytes, or those bytes are not valid
        # UTF-8: the markers were legitimate characters, leave text alone.
        return text

    if mojibake_score(repaired) < mojibake_score(text):
        return repaired
    return text
98+
99+
100+
def normalize_text_block(s: str) -> str:
    """Run the full cleanup pipeline over *s* and return the cleaned text.

    Steps, in order: mojibake repair, NFC composition, removal of
    zero-width/bidi controls, NBSP-like characters to ASCII space, and
    typographic punctuation to ASCII. Accented letters and the French
    guillemets « » are left as they are.
    """
    # Mojibake repair goes first: it works better before other rules.
    repaired = try_fix_mojibake(s)

    # NFC keeps accents (composed form) and avoids compatibility folding.
    composed = unicodedata.normalize("NFC", repaired)

    without_controls = ZERO_WIDTH_BIDI_RE.sub("", composed)
    plain_spaces = NBSP_RE.sub(" ", without_controls)

    # Quotes, dashes and ellipsis become their ASCII equivalents.
    return plain_spaces.translate(TRANSLATE_TABLE)
118+
119+
120+
# ----- Processing -----
121+
122+
def process_file(path: Path, *, force_text: bool = False) -> None:
    """Normalize a single file in place and print a one-line status for it.

    The file is read as strict UTF-8, passed through normalize_text_block(),
    and rewritten only if the text changed. The rewrite goes through a temp
    file in the same directory that then replaces the original, so a failure
    midway never leaves a half-written file behind.

    Args:
        path: File to process.
        force_text: If True, skip the binary (NUL) check and always try to
            read the file as UTF-8.
    """
    print(f"[FOUND] {path}")

    if not force_text and is_binary(path):
        print(" -> Skipped (binary file)")
        return

    try:
        # Read the full text to allow safe mojibake detection and normalization.
        # newline="" turns off newline translation so CRLF files keep CRLF.
        with open(path, "r", encoding="utf-8", errors="strict", newline="") as fin:
            text = fin.read()
    except UnicodeDecodeError:
        print(" -> Skipped (encoding not UTF-8)")
        return
    except OSError:
        print(" -> Skipped (read error)")
        return

    normalized = normalize_text_block(text)
    if normalized == text:
        print(" -> OK (clean)")
        return

    temp_path = None
    try:
        # Temp file lives next to the target so the final replace stays on
        # one filesystem; newline="" writes line endings exactly as read.
        with NamedTemporaryFile(
            "w", delete=False, encoding="utf-8", newline="", dir=str(path.parent)
        ) as fout:
            temp_path = Path(fout.name)
            fout.write(normalized)
        # Temp files are created owner-only; carry over the original mode bits.
        shutil.copymode(path, temp_path)
        # Path.replace overwrites the destination in a single rename step.
        temp_path.replace(path)
        temp_path = None  # consumed by the replace, nothing left to clean up
    except OSError:
        print(" -> Skipped (write error)")
        return
    finally:
        if temp_path is not None:
            # Do not leave a stray temp file behind after a failed rewrite.
            try:
                temp_path.unlink()
            except OSError:
                pass

    print(" -> ✅ FIXED")
159+
160+
161+
def main():
    """Command-line entry point.

    Parses ``target`` and the optional ``extension`` argument, then either
    processes the single file ``target`` (without the NUL-byte binary check)
    or walks the folder ``target`` recursively and processes every regular,
    non-symlink file whose suffix matches ``extension`` (case-insensitive).

    Exits with status 1 when the target does not exist, is neither a regular
    file nor a directory, or is a folder and no extension was given.
    """
    parser = argparse.ArgumentParser(
        # The description below is hand-formatted (bullets, usage lines);
        # the default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "Recursive Unicode normalizer for text files.\n"
            "- Fixes dashes/quotes/ellipsis to ASCII punctuation\n"
            "- Repairs French accents (NFC) and common UTF-8 mojibake\n"
            "- Removes zero-width/BOM/bidi controls\n"
            "- Converts NBSP/narrow NBSP to normal space\n"
            "- Keeps « » and French letters intact\n\n"
            "Usage:\n"
            " script.py /path/to/file.md\n"
            " script.py /path/to/folder .md"
        ),
    )
    parser.add_argument(
        "target",
        help="File OR folder to scan. If file, only that file is processed. If folder, scan recursively.",
    )
    parser.add_argument(
        "extension",
        nargs="?",
        help="File extension to match when target is a folder (e.g. .txt, .md, .csv)",
    )
    args = parser.parse_args()

    target = Path(args.target)

    # --- Single-file mode ---
    if target.is_file():
        # User explicitly pointed at this file: trust them, don't NUL-check
        process_file(target, force_text=True)
        return

    # --- Folder mode ---
    if not target.exists():
        print(f"Target not found: {target}", file=sys.stderr)
        sys.exit(1)

    if not target.is_dir():
        print(
            f"Target is neither a regular file nor a directory: {target}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not args.extension:
        print(
            "When target is a folder, you must specify an extension (e.g. .md, .txt).",
            file=sys.stderr,
        )
        sys.exit(1)

    # Accept both "md" and ".md"; matching is case-insensitive.
    ext = args.extension.lower()
    if not ext.startswith("."):
        ext = "." + ext

    for p in target.rglob("*"):
        if p.is_file() and p.suffix.lower() == ext and not p.is_symlink():
            # In folder mode, still protect against true binary files via NUL check
            process_file(p, force_text=False)
221+
222+
223+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)