|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import re |
| 4 | +import shutil |
| 5 | +import sys |
| 6 | +import unicodedata |
| 7 | +from pathlib import Path |
| 8 | +from tempfile import NamedTemporaryFile |
| 9 | + |
| 10 | +# ----- Patterns & Tables ----- |
| 11 | + |
# Invisible formatting characters that are stripped outright: zero-width
# characters, the BOM, and every explicit bidirectional control (marks,
# embeddings/overrides, and isolates).
ZERO_WIDTH_BIDI_RE = re.compile(
    "["                 # open char class
    r"\u200B-\u200D"    # zero-width space / non-joiner / joiner
    r"\u2060"           # word joiner
    r"\u200E\u200F"     # LTR / RTL marks
    r"\u202A-\u202E"    # bidi embeddings, overrides and their terminator (PDF)
    r"\u2066-\u2069"    # bidi isolates (LRI, RLI, FSI) and their terminator (PDI)
    r"\u061C"           # Arabic Letter Mark
    r"\uFEFF"           # BOM / zero-width no-break space
    "]"
)
| 23 | + |
# Space look-alikes that get replaced by a plain ASCII space.
NBSP_RE = re.compile(
    "["
    "\u00A0"  # no-break space
    "\u202F"  # narrow no-break space
    "\u180E"  # Mongolian vowel separator (deprecated)
    "]"
)
| 26 | + |
# Typographic punctuation and the ASCII text that replaces it.
# French guillemets (« ») are deliberately absent so they survive untouched.
TYPO_MAP = {
    fancy: ascii_text
    for ascii_text, fancy_chars in (
        # hyphen, non-breaking hyphen, figure dash, en dash, em dash, minus sign
        ("-", "\u2010\u2011\u2012\u2013\u2014\u2212"),
        # left/right single quotes, high-reversed-9 single quote, prime
        ("'", "\u2018\u2019\u201B\u2032"),
        # left/right double quotes, high-reversed-9 double quote, double prime
        ('"', "\u201C\u201D\u201F\u2033"),
        # horizontal ellipsis expands to three ASCII dots
        ("...", "\u2026"),
    )
    for fancy in fancy_chars
}

# Code-point keyed table so str.translate() applies every replacement in one pass.
TRANSLATE_TABLE = {ord(fancy): ascii_text for fancy, ascii_text in TYPO_MAP.items()}
| 51 | + |
# Characters whose presence hints that UTF-8 bytes were decoded with a
# single-byte codec. This is a heuristic only: several of them are also
# legitimate characters, so they gate a repair attempt rather than prove damage.
MOJIBAKE_MARKERS = (
    "Ã",
    "Â",
    "â",
    "€",
    "œ",
    "�",
)
| 54 | + |
| 55 | + |
| 56 | +# ----- Helpers ----- |
| 57 | + |
def is_binary(path: Path, probe_size: int = 4096) -> bool:
    """
    Report whether *path* should be treated as a binary file.

    The check is intentionally minimal: a file counts as binary only when a
    NUL byte appears within its first ``probe_size`` bytes. No decoding is
    attempted, so text in a non-UTF-8 encoding is not misclassified here.

    Returns True as well when the file cannot be opened or read, so that
    callers skip it.
    """
    try:
        with open(path, "rb") as stream:
            head = stream.read(probe_size)
    except Exception:
        # Unreadable file: err on the side of skipping it.
        return True
    return head.find(b"\x00") != -1
| 71 | + |
| 72 | + |
def looks_like_mojibake(text: str) -> bool:
    """Return True if *text* contains at least one of MOJIBAKE_MARKERS."""
    for marker in MOJIBAKE_MARKERS:
        if marker in text:
            return True
    return False
| 75 | + |
| 76 | + |
def mojibake_score(text: str) -> int:
    """Return the total number of MOJIBAKE_MARKERS occurrences in *text*."""
    total = 0
    for marker in MOJIBAKE_MARKERS:
        total += text.count(marker)
    return total
| 79 | + |
| 80 | + |
def try_fix_mojibake(text: str) -> str:
    """
    Attempt to repair text in which UTF-8 bytes were decoded with a single-byte
    codec BEFORE being written to the file (so the file now contains mojibake).

    The text is re-encoded with each candidate codec and decoded again as
    UTF-8. Windows-1252 is tried as well as Latin-1 because markers such as
    "€" and "œ" do not exist in Latin-1: with Latin-1 alone, any text
    containing them fails to encode and is never repaired.

    A candidate is accepted only if the whole text round-trips under strict
    error handling and the result has fewer mojibake markers than the input.
    Otherwise the input is returned unchanged.
    """
    if not looks_like_mojibake(text):
        return text

    best = text
    best_score = mojibake_score(text)

    # When both codecs can encode the text they produce identical bytes (they
    # only differ in the 0x80-0x9F range), so the order does not affect results.
    for codec in ("cp1252", "latin-1"):
        try:
            candidate = text.encode(codec, errors="strict").decode(
                "utf-8", errors="strict"
            )
        except UnicodeError:
            # Not representable in this codec, or the bytes are not valid
            # UTF-8: this codec is not the one that produced the damage.
            continue

        score = mojibake_score(candidate)
        if score < best_score:
            best, best_score = candidate, score

    return best
| 98 | + |
| 99 | + |
def normalize_text_block(s: str) -> str:
    """
    Run the full clean-up pipeline on *s* and return the result.

    Steps, in order: mojibake repair, NFC composition, removal of zero-width
    and bidi controls, NBSP-like spaces to ASCII space, and typographic
    punctuation to ASCII. Accented letters and French guillemets « » are
    left alone by the later steps.
    """
    # Mojibake repair goes first, before any other rule rewrites characters.
    repaired = try_fix_mojibake(s)

    # NFC composes accents without the compatibility folding of NFKC.
    composed = unicodedata.normalize("NFC", repaired)

    without_controls = ZERO_WIDTH_BIDI_RE.sub("", composed)
    plain_spaces = NBSP_RE.sub(" ", without_controls)

    # Quotes, dashes and ellipsis become their ASCII equivalents.
    return plain_spaces.translate(TRANSLATE_TABLE)
| 118 | + |
| 119 | + |
| 120 | +# ----- Processing ----- |
| 121 | + |
def process_file(path: Path, *, force_text: bool = False) -> None:
    """
    Normalize a single file in place and print a one-line status for it.

    - If force_text=True, skip the binary (NUL) check and always try to read
      the file as UTF-8.
    - Files that are binary, not valid UTF-8, or unreadable are skipped.
    - A file is rewritten only when normalization actually changes its text.

    Line endings are preserved exactly (no newline translation on read or
    write), and the rewritten file keeps the permission bits of the original.
    """
    print(f"[FOUND] {path}")

    if not force_text and is_binary(path):
        print(" -> Skipped (binary file)")
        return

    try:
        # Read the full text to allow safe mojibake detection and normalization.
        # newline="" disables universal-newline translation so CRLF / CR line
        # endings are not silently rewritten when the file is saved back.
        with open(path, "r", encoding="utf-8", errors="strict", newline="") as fin:
            text = fin.read()
    except UnicodeDecodeError:
        print(" -> Skipped (encoding not UTF-8)")
        return
    except OSError:
        print(" -> Skipped (read error)")
        return

    normalized = normalize_text_block(text)
    if normalized == text:
        print(" -> OK (clean)")
        return

    temp_path = None
    try:
        # Write to a temp file in the same directory, then move it over the
        # original, so a failed write never leaves a half-written target.
        with NamedTemporaryFile(
            "w", delete=False, encoding="utf-8", newline="", dir=str(path.parent)
        ) as fout:
            temp_path = Path(fout.name)
            fout.write(normalized)
        # The temp file was created with its own mode; carry the original's
        # permission bits over before it takes the original's place.
        shutil.copymode(path, temp_path)
        shutil.move(str(temp_path), path)
    except OSError:
        # Do not leave a stray temp file next to the target on failure.
        if temp_path is not None:
            try:
                temp_path.unlink()
            except OSError:
                pass
        print(" -> Skipped (write error)")
    else:
        print(" -> ✅ FIXED")
| 159 | + |
| 160 | + |
def main() -> None:
    """
    Command-line entry point.

    Takes a target path and an optional extension:
    - target is a file: only that file is processed, without the NUL check;
    - target is a directory: it is scanned recursively and every regular,
      non-symlink file whose suffix matches the extension (case-insensitive)
      is processed, with the NUL check enabled.

    Exits with status 1 if the target does not exist, is neither a file nor
    a directory, or is a directory and no extension was given.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Recursive Unicode normalizer for text files.\n"
            "- Fixes dashes/quotes/ellipsis to ASCII punctuation\n"
            "- Repairs French accents (NFC) and common UTF-8 mojibake\n"
            "- Removes zero-width/BOM/bidi controls\n"
            "- Converts NBSP/narrow NBSP to normal space\n"
            "- Keeps « » and French letters intact\n\n"
            "Usage:\n"
            " script.py /path/to/file.md\n"
            " script.py /path/to/folder .md"
        ),
        # The description is hand-formatted (bullet list, usage examples);
        # the default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "target",
        help="File OR folder to scan. If file, only that file is processed. If folder, scan recursively.",
    )
    parser.add_argument(
        "extension",
        nargs="?",
        help="File extension to match when target is a folder (e.g. .txt, .md, .csv)",
    )
    args = parser.parse_args()

    target = Path(args.target)

    # --- Single-file mode ---
    if target.is_file():
        # User explicitly pointed at this file: trust them, don't NUL-check
        process_file(target, force_text=True)
        return

    # --- Folder mode ---
    if not target.exists():
        print(f"Target not found: {target}", file=sys.stderr)
        sys.exit(1)

    if not target.is_dir():
        print(
            f"Target is neither a regular file nor a directory: {target}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not args.extension:
        print(
            "When target is a folder, you must specify an extension (e.g. .md, .txt).",
            file=sys.stderr,
        )
        sys.exit(1)

    # Accept the extension with or without a leading dot, in any letter case.
    ext = args.extension.lower()
    if not ext.startswith("."):
        ext = "." + ext

    for p in target.rglob("*"):
        if p.is_file() and p.suffix.lower() == ext and not p.is_symlink():
            # In folder mode, still protect against true binary files via NUL check
            process_file(p, force_text=False)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments