Created
May 10, 2026 17:51
-
-
Save anonhostpi/bc138130e96aa4bab17a07227bae47c7 to your computer and use it in GitHub Desktop.
Symbol audit & backlog tools — find multi-word symbols, classify, track refactoring progress
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Multi-word symbol scanner for any codebase. | |
| Scans source files for multi-word identifiers (camelCase, snake_case, etc.), | |
| optionally classifies them as internal vs external, and outputs CSVs. | |
| Output: | |
| symbols-audit.csv — all occurrences with origin classification | |
| symbols-internal.csv — internal-only, origin column dropped | |
| Usage: | |
| python3 scripts/symbol-audit.py [--root /path/to/repo] | |
| python3 scripts/symbol-audit.py --exclude kebab,pascal,screaming | |
| python3 scripts/symbol-audit.py --only camel,snake | |
| python3 scripts/symbol-audit.py --ext .ts,.yaml --skip dist,generated | |
| python3 scripts/symbol-audit.py --externals externals.txt --include 'src/**' | |
| python3 scripts/symbol-audit.py --min-words 3 | |
| """ | |
| import argparse | |
| import csv | |
| import fnmatch | |
| import os | |
| import re | |
| import sys | |
| from collections import Counter | |
| # ── Defaults ───────────────────────────────────────────────────────────────── | |
# Directory names pruned from the walk when --skip is not given.
DEFAULT_SKIP = {".git", "node_modules", "vendor", "dist", ".cache"}
# File extensions scanned when --ext is not given (compared against
# os.path.splitext(name)[1], so each entry keeps its leading dot).
DEFAULT_EXTS = {".ts", ".js", ".tsx", ".jsx", ".py", ".go", ".rs", ".yaml", ".yml"}
| # ── Patterns ───────────────────────────────────────────────────────────────── | |
# Maps a kind name (written to the CSV "kind" column) to the regex that finds
# identifiers of that kind. Each naming style has a bare form plus variants
# that require a literal leading "_" or "#". Insertion order is the order in
# which patterns are applied when no --only/--exclude filter is given.
ALL_PATTERNS = {
    "camelCase": re.compile(r'\b[a-z]\w*[A-Z]\w*\b'),
    "_camelCase": re.compile(r'\b_[a-z]\w*[A-Z]\w*\b'),
    "#camelCase": re.compile(r'#[a-z]\w*[A-Z]\w*\b'),
    "PascalCase": re.compile(r'\b(?:[A-Z][a-z][a-zA-Z0-9]*){2,}\b'),
    "_PascalCase": re.compile(r'\b_[A-Z]\w*[a-z]\w*\b'),
    "#PascalCase": re.compile(r'#[A-Z]\w*[a-z]\w*\b'),
    "snake_case": re.compile(r'\b[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "_snake_case": re.compile(r'\b_[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "#snake_case": re.compile(r'#[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "SCREAMING_SNAKE": re.compile(r'\b[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "_SCREAMING_SNAKE": re.compile(r'\b_[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "#SCREAMING_SNAKE": re.compile(r'#[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "kebab-case": re.compile(r'\b[a-z][a-z0-9]*(?:-[a-z0-9]+)+\b'),
}
# Short group names accepted by --only / --exclude; each expands to the
# ALL_PATTERNS keys listed for it.
GROUPS = {
    "camel": ["camelCase", "_camelCase", "#camelCase"],
    "pascal": ["PascalCase", "_PascalCase", "#PascalCase"],
    "snake": ["snake_case", "_snake_case", "#snake_case"],
    "screaming": ["SCREAMING_SNAKE", "_SCREAMING_SNAKE", "#SCREAMING_SNAKE"],
    "kebab": ["kebab-case"],
}
| # ── Helpers ────────────────────────────────────────────────────────────────── | |
def load_externals(path: str) -> set[str]:
    """Read the list of external symbols used to classify matches.

    Args:
        path: File with one symbol per line. A falsy value means "no list".

    Returns:
        The set of whitespace-stripped lines, excluding blank lines and
        lines whose first non-blank character is ``#`` (comments).
    """
    if not path:
        return set()
    with open(path) as f:
        # Strip first so an indented "# comment" is recognised as a comment
        # instead of being loaded as a symbol.
        stripped = (line.strip() for line in f)
        return {entry for entry in stripped if entry and not entry.startswith("#")}
def matches_include(rel: str, patterns: list[str]) -> bool:
    """Tell whether a relative path passes the include filter.

    An empty pattern list accepts every path; otherwise the path must match
    at least one of the globs under ``fnmatch.fnmatch``.
    """
    if patterns:
        for glob in patterns:
            if fnmatch.fnmatch(rel, glob):
                return True
        return False
    return True
def build_line_index(lines: list[str]) -> list[int]:
    """Return cumulative character offsets for a list of lines.

    Entry ``i`` is the offset at which ``lines[i]`` starts in the
    concatenation of all lines; one extra final entry holds the total length.
    """
    offsets = [0]
    for text in lines:
        offsets.append(offsets[-1] + len(text))
    return offsets
def line_of(idx: list[int], pos: int) -> int:
    """Map a character offset to a 1-based line number.

    Binary-searches ``idx`` (ascending offsets, as built by
    ``build_line_index``) for the last entry that is ``<= pos`` and returns
    that entry's position plus one.
    """
    low = 0
    high = len(idx) - 1
    while high > low:
        # Round the midpoint up so the "low = probe" branch always advances.
        probe = (low + high + 1) // 2
        if pos < idx[probe]:
            high = probe - 1
        else:
            low = probe
    return low + 1
def word_count(sym: str) -> int:
    """Count the words that make up an identifier.

    Leading ``#`` and ``_`` characters are ignored. Words are either an
    optionally capitalised lowercase/digit run (``parse``, ``Response``,
    ``base64``) or an all-uppercase run with optional trailing digits
    (``HTTP``, ``MAX``, ``HTTP2``).

    The uppercase run ends wherever the next character is not a lowercase
    letter, so ``_``, ``-`` and end of string all act as boundaries. The
    previous lookahead only accepted another capital or end of string,
    which dropped single-letter segments (``A_B`` counted as one word).

    Args:
        sym: The identifier as matched in the source text.

    Returns:
        Number of words found; 0 for an empty or prefix-only symbol.
    """
    clean = sym.lstrip("#_")
    parts = re.findall(r'[A-Z]?[a-z0-9]+|[A-Z]+(?![a-z])[0-9]*', clean)
    if not parts:
        # Nothing alphanumeric matched; fall back to underscore splitting.
        parts = clean.split("_")
    return len([p for p in parts if p])
def resolve_patterns(only, exclude):
    """Select the (kind, regex) pairs to scan with.

    ``only`` takes precedence: when set, just the named kinds are kept and
    ``exclude`` is not consulted. Otherwise ``exclude`` removes the named
    kinds. With neither, every entry of ALL_PATTERNS is returned. Results
    keep ALL_PATTERNS order.

    Args:
        only: Comma-separated group or kind names, or a falsy value.
        exclude: Comma-separated group or kind names, or a falsy value.
    """
    def expand(spec):
        # A token that is not a group name is taken as a literal kind name.
        kinds = set()
        for token in spec.split(","):
            token = token.strip()
            kinds.update(GROUPS.get(token, [token]))
        return kinds

    if only:
        wanted = expand(only)
        return [(kind, pat) for kind, pat in ALL_PATTERNS.items() if kind in wanted]
    if exclude:
        dropped = expand(exclude)
        return [(kind, pat) for kind, pat in ALL_PATTERNS.items() if kind not in dropped]
    return list(ALL_PATTERNS.items())
| # ── Scanner ────────────────────────────────────────────────────────────────── | |
def scan(
    root: str,
    patterns: list[tuple[str, re.Pattern]],
    skip: set[str],
    exts: set[str],
    includes: list[str],
    externals: set[str],
    min_words: int,
) -> list[tuple]:
    """Walk ``root`` and collect every multi-word symbol occurrence.

    Args:
        root: Directory to walk.
        patterns: ``(kind, compiled regex)`` pairs to apply to each file.
        skip: Directory names to prune; dot-directories are always pruned.
        exts: File extensions (with leading dot) to scan.
        includes: Globs matched against the path relative to ``root``;
            an empty list accepts every file.
        externals: Symbols to tag as ``"external"``; all others are
            ``"internal"``.
        min_words: Occurrences with fewer words than this are dropped.

    Returns:
        A list of ``(symbol, kind, relative_path, line, word_count, origin)``
        tuples in discovery order.
    """
    rows = []
    for dirpath, dirs, files in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [d for d in dirs if d not in skip and not d.startswith(".")]
        for fname in files:
            if os.path.splitext(fname)[1] not in exts:
                continue
            fpath = os.path.join(dirpath, fname)
            rel = os.path.relpath(fpath, root)
            if not matches_include(rel, includes):
                continue
            try:
                # Explicit UTF-8 keeps results independent of the locale;
                # errors="replace" means undecodable bytes never raise.
                with open(fpath, encoding="utf-8", errors="replace") as f:
                    lines = f.readlines()
            except OSError as err:
                # Best effort: an unreadable file is reported and skipped.
                print(f"warning: skipping {rel}: {err}", file=sys.stderr)
                continue
            text = "".join(lines)
            idx = build_line_index(lines)
            for kind, pat in patterns:
                for m in pat.finditer(text):
                    sym = m.group()
                    wc = word_count(sym)
                    if wc < min_words:
                        continue
                    origin = "external" if sym in externals else "internal"
                    rows.append((sym, kind, rel, line_of(idx, m.start()), wc, origin))
    return rows
| # ── Main ───────────────────────────────────────────────────────────────────── | |
def main():
    """Parse CLI options, run the scan, write both CSV reports, print stats.

    Writes ``symbols-audit.csv`` (all rows, with origin) and
    ``symbols-internal.csv`` (internal rows, no origin column) into the
    ``--out`` directory, creating it if needed. Exits with status 1 when the
    ``--only`` / ``--exclude`` selection leaves no patterns.
    """
    parser = argparse.ArgumentParser(description="Multi-word symbol scanner")
    parser.add_argument("--root", default=".", help="Repository root (default: cwd)")
    parser.add_argument("--out", default="/tmp", help="Output directory (default: /tmp)")
    parser.add_argument("--only", default=None,
                        help="Comma-separated pattern groups to include: camel,pascal,snake,screaming,kebab")
    parser.add_argument("--exclude", default=None,
                        help="Comma-separated pattern groups to exclude: camel,pascal,snake,screaming,kebab")
    parser.add_argument("--ext", default=None,
                        help="Comma-separated file extensions to scan (default: .ts,.js,.tsx,.jsx,.py,.go,.rs,.yaml,.yml)")
    parser.add_argument("--skip", default=None,
                        help="Comma-separated directory names to skip (default: .git,node_modules,vendor,dist,.cache)")
    parser.add_argument("--include", action="append", default=[],
                        help="Glob pattern for files to include (can be repeated; default: all files)")
    parser.add_argument("--externals", default=None,
                        help="Path to a file listing external symbols (one per line)")
    parser.add_argument("--min-words", type=int, default=2,
                        help="Minimum word count to report (default: 2)")
    args = parser.parse_args()

    root = os.path.abspath(args.root)
    patterns = resolve_patterns(args.only, args.exclude)
    if not patterns:
        print("error: no patterns selected", file=sys.stderr)
        sys.exit(1)

    if args.ext:
        # Strip before testing for the dot so " .ts" does not become "..ts",
        # and ignore empty items produced by stray commas.
        exts = set()
        for item in args.ext.split(","):
            ext = item.strip()
            if ext:
                exts.add(ext if ext.startswith(".") else f".{ext}")
    else:
        exts = DEFAULT_EXTS
    if args.skip:
        skip = {d.strip() for d in args.skip.split(",") if d.strip()}
    else:
        skip = DEFAULT_SKIP
    externals = load_externals(args.externals) if args.externals else set()

    print(f"scanning with: {', '.join(k for k, _ in patterns)}")
    if args.include:
        print(f"include: {args.include}")

    rows = scan(root, patterns, skip, exts, args.include, externals, args.min_words)
    rows.sort(key=lambda x: (x[1], x[0].lower()))

    # The default /tmp exists on POSIX, but a user-supplied --out may not.
    os.makedirs(args.out, exist_ok=True)

    # Full audit with origin
    audit_path = os.path.join(args.out, "symbols-audit.csv")
    with open(audit_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["symbol", "kind", "file", "line", "word_count", "origin"])
        w.writerows(rows)

    # Internal only, no origin column
    internal_path = os.path.join(args.out, "symbols-internal.csv")
    with open(internal_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["symbol", "kind", "file", "line", "word_count"])
        for r in rows:
            if r[5] == "internal":
                w.writerow(r[:5])

    # Stats
    total = len(rows)
    by_origin = Counter(r[5] for r in rows)
    by_kind = Counter(r[1] for r in rows)
    internal = [r for r in rows if r[5] == "internal"]
    unique_internal = len({(r[0], r[1]) for r in internal})
    print(f"\nscanned: {total} occurrences")
    print(f" external {by_origin.get('external', 0)}")
    print(f" internal {by_origin.get('internal', 0)}")
    print(f" unique internal symbols: {unique_internal}")
    print()
    print("by kind:")
    for k, v in sorted(by_kind.items()):
        print(f" {k:<20} {v}")
    print()
    print(f"wrote: {audit_path}")
    print(f"wrote: {internal_path}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Symbol backlog manager. Works with the output of symbol-audit.py. | |
| Storage format: TOML at the path given by --backlog (default: /tmp/symbols-backlog.toml). | |
| Each symbol is a section keyed by symbol name with fields: | |
| paths, kind, label, to, words | |
| Commands: | |
| init — create a fresh backlog from symbols-internal.csv | |
| unlabelled — print all symbols not yet classified | |
| label — apply a classification to symbols matching a pattern | |
| labels — print built-in classification labels with descriptions | |
| stats — print classification counts | |
| export — export backlog as CSV or JSON | |
| summary — print a markdown summary grouped by label | |
| Usage: | |
| python3 scripts/symbol-backlog.py init --from /tmp/symbols-internal.csv | |
| python3 scripts/symbol-backlog.py unlabelled | |
| python3 scripts/symbol-backlog.py label truncate --symbols 'authHeader,baseUrl' --to 'header,url' | |
| python3 scripts/symbol-backlog.py label as-is --match 'snake_case:*' | |
| python3 scripts/symbol-backlog.py labels | |
| python3 scripts/symbol-backlog.py stats | |
| python3 scripts/symbol-backlog.py export --label truncate --format csv | |
| python3 scripts/symbol-backlog.py summary | |
| """ | |
| import argparse | |
| import csv | |
| import fnmatch | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from collections import Counter | |
# Backlog file used when --backlog is not given.
DEFAULT_BACKLOG = "/tmp/symbols-backlog.toml"
# ── Built-in labels ──────────────────────────────────────────────────────────
# Conventional label names mapped to their descriptions. They are listed by
# the `labels` command and fill the description column of `summary`; the
# `label` command accepts any string, not only these keys.
LABELS = {
    "subject.verb": "Property/action belongs on an existing entity; refactor to natural OOP",
    "state-machine": "Process/action should be a first-class object; Process.run() style",
    "truncate": "Symbol can be shortened without collision",
    "prefix-grouping": "Multiple prefix* locals → const prefix = { a, b, c }",
    "suffix-grouping": "Multiple *suffix locals → const suffix = { a, b }",
    "kv-ify": "Host/port or similar pairs → { host, port }",
    "snakify": "camelCase field should be snake_case to match convention",
    "kebabify": "Event names should use kebab-case",
    "json-yaml": "Config/log schema field; has a `to` target for standardizing",
    "api": "External API field name; do not rename",
    "as-is": "Exempt: function name, filename, comment text, or user decision",
    "needs-context": "Revisit later with more context",
}
| # ── Helpers ────────────────────────────────────────────────────────────────── | |
def split_words(sym):
    """Split an identifier into its lowercase component words.

    Leading ``#`` and ``_`` characters are ignored. Words are either an
    optionally capitalised lowercase/digit run (``auth``, ``Header``) or an
    all-uppercase run with optional trailing digits (``HTTP``, ``MAX``).

    The uppercase run ends wherever the next character is not a lowercase
    letter. The previous lookahead only accepted another capital or end of
    string, so ``MAX_SIZE`` was stored as ``["ma", "size"]`` and
    single-letter segments such as the ``A`` in ``A_B`` were lost.

    Args:
        sym: Symbol name as recorded in the audit CSV.

    Returns:
        The words in order, lowercased; an empty list for an empty symbol.
    """
    clean = sym.lstrip("#_")
    parts = re.findall(r'[A-Z]?[a-z0-9]+|[A-Z]+(?![a-z])[0-9]*', clean)
    if not parts:
        # Nothing alphanumeric matched; fall back to underscore splitting.
        parts = clean.split("_")
    return [p.lower() for p in parts if p]
def load(path):
    """Parse the backlog file into ``{symbol: fields}``.

    The reader is line-based: a ``[name]`` line opens a section with default
    fields, and each following ``key = value`` line sets a field on it.
    Values wrapped in ``[...]`` become lists of unquoted strings, values
    wrapped in double quotes lose the quotes, anything else is kept verbatim.
    Lines before the first section, and lines matching neither form, are
    ignored.

    Prints an error and exits with status 1 if ``path`` does not exist.
    """
    if not os.path.exists(path):
        print(f"error: {path} not found. Run 'init' first.", file=sys.stderr)
        sys.exit(1)

    section_re = re.compile(r'^\[(.+)\]$')
    pair_re = re.compile(r'^(\w+)\s*=\s*(.+)$')

    entries = {}
    current = None
    with open(path) as f:
        for raw in f:
            line = raw.rstrip("\n")

            header = section_re.match(line)
            if header:
                current = header.group(1)
                entries[current] = {"paths": [], "kind": "", "label": "", "to": "", "words": []}
                continue
            if current is None:
                continue

            pair = pair_re.match(line)
            if not pair:
                continue
            key = pair.group(1)
            val = pair.group(2).strip()

            if val.startswith("[") and val.endswith("]"):
                parsed = []
                for piece in val[1:-1].split(","):
                    if piece.strip():
                        parsed.append(piece.strip().strip('"').strip("'"))
            elif val.startswith('"') and val.endswith('"'):
                parsed = val[1:-1]
            else:
                parsed = val
            entries[current][key] = parsed
    return entries
def save(entries, path):
    """Write the backlog to ``path``, one section per symbol.

    Sections are ordered by ``(kind, symbol.lower())``. Each section holds
    ``paths``, ``kind``, ``label``, then ``to`` only when it is non-empty,
    then ``words``, followed by a blank line. Values are written without
    any escaping.
    """
    def order(sym):
        return (entries[sym]["kind"], sym.lower())

    def quoted(values):
        return ", ".join(f'"{v}"' for v in values)

    with open(path, "w") as f:
        for sym in sorted(entries, key=order):
            entry = entries[sym]
            kind = entry["kind"]
            label = entry["label"]
            target = entry.get("to")

            chunk = [
                f"[{sym}]\n",
                f"paths = [{quoted(entry['paths'])}]\n",
                f'kind = "{kind}"\n',
                f'label = "{label}"\n',
            ]
            if target:
                chunk.append(f'to = "{target}"\n')
            chunk.append(f"words = [{quoted(entry['words'])}]\n")
            chunk.append("\n")
            f.write("".join(chunk))
| # ── Commands ───────────────────────────────────────────────────────────────── | |
def cmd_init(args):
    """Create a fresh backlog from the internal-symbols CSV.

    Reads ``args.source`` (columns ``symbol``, ``kind``, ``file``), collects
    the distinct files per ``(symbol, kind)`` in first-seen order, and writes
    one unlabelled entry per symbol to ``args.backlog``. Exits with status 1
    if the source file does not exist.
    """
    if not os.path.exists(args.source):
        print(f"error: {args.source} not found", file=sys.stderr)
        sys.exit(1)

    seen = {}
    # Context manager closes the handle; newline="" is what the csv module
    # requires so quoted fields containing newlines are parsed correctly.
    with open(args.source, newline="") as f:
        for row in csv.DictReader(f):
            files = seen.setdefault((row["symbol"], row["kind"]), [])
            if row["file"] not in files:
                files.append(row["file"])

    entries = {}
    # NOTE(review): entries are keyed by symbol only, so if one symbol appears
    # under two kinds the later (symbol, kind) pair replaces the earlier one.
    for (sym, kind), paths in seen.items():
        entries[sym] = {
            "paths": paths,
            "kind": kind,
            "label": "",
            "to": "",
            "words": split_words(sym),
        }
    save(entries, args.backlog)
    print(f"initialized {len(entries)} unique symbols in {args.backlog}")
def cmd_unlabelled(args):
    """Print every symbol whose label is empty, grouped by kind.

    Each line shows the symbol and the first recorded path. A final line
    gives the unlabelled and total counts. Prints a single notice instead
    when nothing is unlabelled.
    """
    entries = load(args.backlog)
    unlabelled = {s: e for s, e in entries.items() if not e["label"]}
    if not unlabelled:
        print("all symbols are labelled")
        return

    by_kind = {}
    for sym, e in unlabelled.items():
        by_kind.setdefault(e["kind"], []).append((sym, e))

    for kind in sorted(by_kind):
        items = by_kind[kind]
        print(f"\n{kind} ({len(items)}):")
        for sym, e in sorted(items, key=lambda x: x[0].lower()):
            # A section without a paths line loads as an empty list; show
            # nothing for it rather than raising IndexError.
            first_path = e["paths"][0] if e["paths"] else ""
            print(f" {sym:<40} {first_path}")
    print(f"\n{len(unlabelled)} unlabelled / {len(entries)} total")
def cmd_label(args):
    """Apply ``args.classification`` to the selected symbols and save.

    Selection is either ``--symbols`` (comma-separated names) or ``--match``
    (``kind_glob:symbol_glob``, or just ``symbol_glob``). Exits with status 1
    when neither is given.

    ``--to`` handling: with ``--symbols``, when ``--to`` has as many
    comma-separated items as there are symbols (and more than one), the
    targets are paired positionally, as in
    ``--symbols 'authHeader,baseUrl' --to 'header,url'``. In every other
    case the whole ``--to`` string is stored on each selected symbol.

    Symbols named in ``--symbols`` that are not in the backlog are reported
    on stderr and skipped.
    """
    entries = load(args.backlog)
    changed = 0

    def apply(sym, target):
        nonlocal changed
        entries[sym]["label"] = args.classification
        if target:
            entries[sym]["to"] = target
        changed += 1

    if args.symbols:
        names = [s.strip() for s in args.symbols.split(",") if s.strip()]
        targets = [t.strip() for t in args.to.split(",")] if args.to else []
        if len(names) > 1 and len(targets) == len(names):
            plan = dict(zip(names, targets))
        else:
            plan = {name: args.to for name in names}
        # dict iteration keeps the order given and drops duplicate names.
        for sym, target in plan.items():
            if sym in entries:
                apply(sym, target)
            else:
                print(f"warning: '{sym}' not found in backlog; skipped", file=sys.stderr)
    elif args.match:
        if ":" in args.match:
            kind_pat, sym_pat = args.match.split(":", 1)
        else:
            kind_pat, sym_pat = "*", args.match
        for sym, e in entries.items():
            if fnmatch.fnmatch(e["kind"], kind_pat) and fnmatch.fnmatch(sym, sym_pat):
                apply(sym, args.to)
    else:
        print("error: provide --symbols or --match", file=sys.stderr)
        sys.exit(1)

    save(entries, args.backlog)
    print(f"labelled {changed} symbols as '{args.classification}'")
def cmd_labels(_args):
    """List the built-in labels with their descriptions, then a count line."""
    print("Built-in classification labels:\n")
    for name in LABELS:
        desc = LABELS[name]
        print(f" {name:<16} — {desc}")
    count = len(LABELS)
    print(f"\n{count} labels. Any string is accepted; these are conventions.")
def cmd_stats(args):
    """Print the total symbol count and a per-label tally.

    Symbols with an empty label are counted under ``(unlabelled)``; labels
    are listed in sorted order.
    """
    entries = load(args.backlog)
    tally = Counter()
    for entry in entries.values():
        tally[entry["label"] or "(unlabelled)"] += 1

    print(f"total: {len(entries)}")
    for label in sorted(tally):
        print(f" {label:<20} {tally[label]}")
def cmd_export(args):
    """Write the backlog, optionally filtered by label, to stdout.

    With ``--format json`` the output is a JSON array of entries sorted by
    symbol, each with a ``symbol`` key plus the stored fields. Otherwise it
    is CSV sorted by ``(label, symbol.lower())``, with paths joined by ``;``
    and words joined by spaces.
    """
    entries = load(args.backlog)
    if args.label:
        selected = {}
        for sym, entry in entries.items():
            if entry["label"] == args.label:
                selected[sym] = entry
    else:
        selected = entries

    if args.format == "json":
        records = [{"symbol": sym, **selected[sym]} for sym in sorted(selected)]
        print(json.dumps(records, indent=2))
        return

    def order(sym):
        return (selected[sym]["label"], sym.lower())

    writer = csv.writer(sys.stdout)
    writer.writerow(["symbol", "kind", "label", "to", "paths", "words"])
    for sym in sorted(selected, key=order):
        entry = selected[sym]
        paths_cell = ";".join(entry["paths"])
        words_cell = " ".join(entry["words"])
        writer.writerow([sym, entry["kind"], entry["label"], entry.get("to", ""),
                         paths_cell, words_cell])
def cmd_summary(args):
    """Print a markdown table of label counts.

    Rows are ordered by descending count, then label name. The description
    column comes from LABELS and is empty for labels not listed there.
    Symbols with an empty label are counted under ``(unlabelled)``.
    """
    entries = load(args.backlog)
    counts = Counter()
    for entry in entries.values():
        counts[entry["label"] or "(unlabelled)"] += 1

    print(f"## Symbol Backlog Summary ({len(entries)} symbols)\n")
    print("| Label | Count | Description |")
    print("|---|---|---|")
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
    for label, count in ranked:
        desc = LABELS.get(label, "")
        print(f"| `{label}` | {count} | {desc} |")
    print()
| # ── CLI ────────────────────────────────────────────────────────────────────── | |
def main():
    """Build the CLI, parse arguments, and run the chosen sub-command.

    Prints the top-level help when no sub-command is given.
    """
    parser = argparse.ArgumentParser(description="Symbol backlog manager")
    parser.add_argument("--backlog", default=DEFAULT_BACKLOG,
                        help=f"Path to backlog TOML (default: {DEFAULT_BACKLOG})")
    sub = parser.add_subparsers(dest="command")

    init_parser = sub.add_parser("init", help="Create fresh backlog from symbols-internal.csv")
    init_parser.add_argument("--from", dest="source", default="/tmp/symbols-internal.csv")

    sub.add_parser("unlabelled", help="Print unlabelled symbols")
    sub.add_parser("labels", help="Print built-in classification labels")
    sub.add_parser("stats", help="Print classification counts")
    sub.add_parser("summary", help="Print markdown summary")

    label_parser = sub.add_parser("label", help="Apply a classification")
    label_parser.add_argument("classification", help="Label to apply")
    label_parser.add_argument("--symbols", help="Comma-separated symbol names")
    label_parser.add_argument("--match", help="Pattern: 'kind:symbol_glob' or 'symbol_glob'")
    label_parser.add_argument("--to", help="Target name after refactoring (optional)")

    export_parser = sub.add_parser("export", help="Export backlog as CSV or JSON")
    export_parser.add_argument("--label", help="Filter to a specific label")
    export_parser.add_argument("--format", choices=["csv", "json"], default="csv")

    args = parser.parse_args()

    # Sub-command name -> handler; a missing command falls through to help.
    handlers = {
        "init": cmd_init,
        "unlabelled": cmd_unlabelled,
        "label": cmd_label,
        "labels": cmd_labels,
        "stats": cmd_stats,
        "export": cmd_export,
        "summary": cmd_summary,
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
    else:
        handler(args)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment