Skip to content

Commit a09f9ac

Browse files
committed
Add roast/scanner.py
1 parent 23dc8c4 commit a09f9ac

1 file changed

Lines changed: 146 additions & 0 deletions

File tree

roast/scanner.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""Repository scanning utilities."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass
6+
from pathlib import Path
7+
import logging
8+
from typing import Iterable
9+
10+
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Path components that exclude a file from the scan. They are compared against
# lowercased path parts, so the match is case-insensitive.
SKIP_DIRS = {"node_modules", ".git", "__pycache__", "dist", "build", "venv", ".venv"}

# File suffixes classified as configuration (or documentation) rather than
# source code.
CONFIG_EXTENSIONS = {
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".md",
}

# Maps a lowercased file suffix to the language label reported for that file.
LANGUAGE_BY_EXTENSION = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
    ".rb": "ruby",
    ".php": "php",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c-header",
}
38+
39+
40+
@dataclass(slots=True)
41+
class FileResult:
42+
path: str
43+
content: str
44+
language: str
45+
line_count: int
46+
47+
48+
def _normalize_extensions(extensions: Iterable[str]) -> set[str]:
49+
normalized: set[str] = set()
50+
for ext in extensions:
51+
clean = ext.strip().lower()
52+
if not clean:
53+
continue
54+
if not clean.startswith("."):
55+
clean = f".{clean}"
56+
normalized.add(clean)
57+
return normalized
58+
59+
60+
def _is_binary_file(path: Path) -> bool:
61+
try:
62+
with path.open("rb") as fh:
63+
chunk = fh.read(4096)
64+
except OSError:
65+
return True
66+
return b"\x00" in chunk
67+
68+
69+
def _infer_language(path: Path) -> str:
    """Return a language label for *path* based on its suffix.

    The lowercased suffix is looked up in LANGUAGE_BY_EXTENSION. Unknown
    suffixes fall back to the suffix without its leading dot, and paths with
    no suffix are labelled ``"text"``.
    """
    ext = path.suffix.lower()
    return LANGUAGE_BY_EXTENSION.get(ext, ext.lstrip(".") or "text")
72+
73+
74+
def _is_config_file(path: Path) -> bool:
    """Return True when *path* is classified as a configuration file.

    A file qualifies when its lowercased suffix is in CONFIG_EXTENSIONS, or
    when its lowercased name is one of the well-known lockfile/linter names
    listed below.
    """
    if path.suffix.lower() in CONFIG_EXTENSIONS:
        return True
    # Names that are config-like regardless of their suffix.
    return path.name.lower() in {
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "poetry.lock",
        "pylintrc",
        "eslint.config.js",
    }
85+
86+
87+
def _should_skip_path(path: Path) -> bool:
    """Return True when *path* must be excluded from the scan.

    A path is skipped when any of its components, compared lowercased, is in
    SKIP_DIRS, or when its final component is ``.env`` or starts with
    ``.env.`` (case-insensitive).

    Note that every component is checked, including the final one, so a file
    whose own name equals a SKIP_DIRS entry is skipped as well.
    """
    lowered_parts = {part.lower() for part in path.parts}
    if lowered_parts & SKIP_DIRS:
        return True
    name = path.name.lower()
    return name == ".env" or name.startswith(".env.")
93+
94+
95+
def scan_repo(
    path: str | Path,
    extensions: Iterable[str],
    max_files: int,
    max_lines: int = 500,
) -> list[FileResult]:
    """Scan a repository path and return parsed source files.

    Files under *path* are collected, ordered with non-config files first
    (then case-insensitively by path), and read as UTF-8 until *max_files*
    results have been gathered. Binary files, files that cannot be read or
    decoded, and files longer than *max_lines* are skipped with a warning and
    do not count towards *max_files*.

    Args:
        path: Directory to scan; ``~`` is expanded and the path is resolved.
        extensions: Suffixes to keep, with or without a leading dot. When no
            usable suffix is supplied, no suffix filtering is applied.
        max_files: Maximum number of results to return.
        max_lines: Files with more lines than this are skipped.

    Returns:
        One FileResult per accepted file, with ``path`` relative to the
        resolved root.
    """
    import os  # function-scope import: no new module-level dependency needed

    root = Path(path).expanduser().resolve()
    ext_filter = _normalize_extensions(extensions)
    candidates: list[tuple[int, Path]] = []

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune skipped directories in place so os.walk never descends into
        # them; every file below them would be rejected by _should_skip_path
        # anyway, and trees such as node_modules or .git can be huge.
        dirnames[:] = [name for name in dirnames if name.lower() not in SKIP_DIRS]
        for filename in filenames:
            file_path = Path(dirpath, filename)
            if not file_path.is_file():
                continue
            if _should_skip_path(file_path.relative_to(root)):
                continue
            if ext_filter and file_path.suffix.lower() not in ext_filter:
                continue
            # Priority 0 sorts first, so source files are taken before config.
            priority = 1 if _is_config_file(file_path) else 0
            candidates.append((priority, file_path))

    candidates.sort(key=lambda item: (item[0], str(item[1]).lower()))
    results: list[FileResult] = []

    for _, file_path in candidates:
        if len(results) >= max_files:
            break
        relative_path = file_path.relative_to(root)

        if _is_binary_file(file_path):
            LOGGER.warning("Skipping binary file: %s", relative_path)
            continue

        try:
            content = file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            LOGGER.warning("Skipping unreadable file (encoding): %s", relative_path)
            continue
        except OSError as exc:
            LOGGER.warning("Skipping unreadable file (%s): %s", exc.__class__.__name__, relative_path)
            continue

        # A trailing newline terminates the last line instead of starting a
        # new one, so only count an extra line for an unterminated final line.
        line_count = content.count("\n")
        if content and not content.endswith("\n"):
            line_count += 1
        if line_count > max_lines:
            LOGGER.warning("Skipping too large to roast (>%d lines): %s", max_lines, relative_path)
            continue

        results.append(
            FileResult(
                path=str(relative_path),
                content=content,
                language=_infer_language(file_path),
                line_count=line_count,
            )
        )

    return results

0 commit comments

Comments
 (0)