Skip to content

Instantly share code, notes, and snippets.

@anonhostpi
Created May 10, 2026 17:51
Show Gist options
  • Select an option

  • Save anonhostpi/bc138130e96aa4bab17a07227bae47c7 to your computer and use it in GitHub Desktop.

Select an option

Save anonhostpi/bc138130e96aa4bab17a07227bae47c7 to your computer and use it in GitHub Desktop.
Symbol audit & backlog tools — find multi-word symbols, classify, track refactoring progress
#!/usr/bin/env python3
"""
Multi-word symbol scanner for any codebase.
Scans source files for multi-word identifiers (camelCase, snake_case, etc.),
optionally classifies them as internal vs external, and outputs CSVs.
Output:
symbols-audit.csv — all occurrences with origin classification
symbols-internal.csv — internal-only, origin column dropped
Usage:
python3 scripts/symbol-audit.py [--root /path/to/repo]
python3 scripts/symbol-audit.py --exclude kebab,pascal,screaming
python3 scripts/symbol-audit.py --only camel,snake
python3 scripts/symbol-audit.py --ext .ts,.yaml --skip dist,generated
python3 scripts/symbol-audit.py --externals externals.txt --include 'src/**'
python3 scripts/symbol-audit.py --min-words 3
"""
import argparse
import csv
import fnmatch
import os
import re
import sys
from collections import Counter
# ── Defaults ─────────────────────────────────────────────────────────────────
# Directory names that are pruned from the walk by default.
DEFAULT_SKIP = {
    ".git",
    "node_modules",
    "vendor",
    "dist",
    ".cache",
}
# File extensions (with leading dot) that are scanned by default.
DEFAULT_EXTS = {
    ".ts",
    ".js",
    ".tsx",
    ".jsx",
    ".py",
    ".go",
    ".rs",
    ".yaml",
    ".yml",
}
# ── Patterns ─────────────────────────────────────────────────────────────────
# Maps a kind name to the compiled regex that finds symbols of that kind.
# Insertion order matters: it is the order in which kinds are listed and scanned.
# The `_`-prefixed and `#`-prefixed kinds match the same shapes as their base
# kind, but with a leading underscore or hash character included in the match.
ALL_PATTERNS = {
    "camelCase": re.compile(r'\b[a-z]\w*[A-Z]\w*\b'),
    "_camelCase": re.compile(r'\b_[a-z]\w*[A-Z]\w*\b'),
    "#camelCase": re.compile(r'#[a-z]\w*[A-Z]\w*\b'),
    # Two explicit "Xx..." humps. This matches exactly the same strings as the
    # former `(?:[A-Z][a-z][a-zA-Z0-9]*){2,}` but without a nested quantifier,
    # which backtracked exponentially on long hump runs that fail the final \b
    # (for example "AaAaAa...Aa_").
    "PascalCase": re.compile(r'\b[A-Z][a-z][a-zA-Z0-9]*[A-Z][a-z][a-zA-Z0-9]*\b'),
    "_PascalCase": re.compile(r'\b_[A-Z]\w*[a-z]\w*\b'),
    "#PascalCase": re.compile(r'#[A-Z]\w*[a-z]\w*\b'),
    "snake_case": re.compile(r'\b[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "_snake_case": re.compile(r'\b_[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "#snake_case": re.compile(r'#[a-z][a-z0-9]*(?:_[a-z][a-z0-9]*)+\b'),
    "SCREAMING_SNAKE": re.compile(r'\b[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "_SCREAMING_SNAKE": re.compile(r'\b_[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "#SCREAMING_SNAKE": re.compile(r'#[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b'),
    "kebab-case": re.compile(r'\b[a-z][a-z0-9]*(?:-[a-z0-9]+)+\b'),
}
# Short group names mapped to the pattern kinds (keys of ALL_PATTERNS) that
# each group stands for: the bare kind plus its `_` and `#` prefixed variants.
GROUPS = {
    "camel": [
        "camelCase",
        "_camelCase",
        "#camelCase",
    ],
    "pascal": [
        "PascalCase",
        "_PascalCase",
        "#PascalCase",
    ],
    "snake": [
        "snake_case",
        "_snake_case",
        "#snake_case",
    ],
    "screaming": [
        "SCREAMING_SNAKE",
        "_SCREAMING_SNAKE",
        "#SCREAMING_SNAKE",
    ],
    # kebab-case has no prefixed variants.
    "kebab": [
        "kebab-case",
    ],
}
# ── Helpers ──────────────────────────────────────────────────────────────────
def load_externals(path: str) -> set[str]:
    """Read the set of external symbol names from a text file.

    The file holds one symbol per line. Surrounding whitespace is ignored,
    blank lines are skipped, and any line whose first non-blank character is
    ``#`` is treated as a comment (so ``#``-prefixed symbols cannot be listed).

    Args:
        path: File to read; a falsy value (``""`` or ``None``) means "no file".

    Returns:
        The symbols found, or an empty set when ``path`` is falsy.

    Raises:
        OSError: If the file cannot be opened.
    """
    if not path:
        return set()
    names = set()
    with open(path, encoding="utf-8") as f:
        for raw in f:
            # Strip before the comment test so indented comments are skipped too.
            entry = raw.strip()
            if entry and not entry.startswith("#"):
                names.add(entry)
    return names
def matches_include(rel: str, patterns: list[str]) -> bool:
    """Tell whether ``rel`` passes the include filter.

    An empty pattern list accepts everything; otherwise ``rel`` must match at
    least one of the ``fnmatch`` globs in ``patterns``.
    """
    if patterns:
        for glob in patterns:
            if fnmatch.fnmatch(rel, glob):
                return True
        return False
    return True
def build_line_index(lines: list[str]) -> list[int]:
    """Return cumulative character offsets for ``lines``.

    The result starts with 0 and has one extra entry per line: entry ``i`` is
    the offset at which line ``i`` begins in the concatenation of ``lines``,
    and the final entry is the total length.
    """
    offsets = [0]
    for text in lines:
        offsets.append(offsets[-1] + len(text))
    return offsets
def line_of(idx: list[int], pos: int) -> int:
    """Map a character offset to a 1-based number using an offset table.

    Binary-searches ``idx`` (ascending offsets) for the last entry that is
    ``<= pos`` and returns that entry's position plus one. When no entry
    qualifies, 1 is returned.
    """
    low = 0
    high = len(idx) - 1
    while high > low:
        # Upper-biased midpoint so that `low = probe` always makes progress.
        probe = low + (high - low + 1) // 2
        if pos < idx[probe]:
            high = probe - 1
        else:
            low = probe
    return low + 1
def word_count(sym: str) -> int:
    """Count the words that make up a multi-word identifier.

    Leading ``#`` and ``_`` characters are ignored. A word is either an
    optional capital followed by lowercase letters/digits (``foo``, ``Bar2``)
    or a run of capitals that is not followed by a lowercase letter or digit
    (``HTTP`` in ``HTTPServer``, ``MAX`` in ``MAX_SIZE``, ``A`` in ``A_B``).

    Args:
        sym: The identifier as matched in the source text.

    Returns:
        The number of words found.
    """
    clean = sym.lstrip("#_")
    # The capital-run branch uses a negative lookahead so a run may end at a
    # separator such as `_` or `-`; requiring "another capital or end of
    # string" dropped single-letter segments, e.g. "A_B_C" counted as 1.
    parts = re.findall(r'[A-Z]?[a-z0-9]+|[A-Z]+(?![a-z0-9])', clean)
    if not parts:
        parts = clean.split("_")
    return len([p for p in parts if p])
def resolve_patterns(only, exclude):
    """Pick the (kind, regex) pairs to scan with.

    ``only`` and ``exclude`` are comma-separated strings whose items are
    either group names from GROUPS or individual kind names. ``only`` wins
    when both are given; with neither, every pattern is returned. Results
    keep the order of ALL_PATTERNS.
    """
    def expand(spec):
        # A name that is not a known group is taken as a kind name verbatim.
        chosen = set()
        for token in spec.split(","):
            token = token.strip()
            chosen.update(GROUPS.get(token, [token]))
        return chosen

    if only:
        wanted = expand(only)
        return [(kind, rx) for kind, rx in ALL_PATTERNS.items() if kind in wanted]
    if exclude:
        unwanted = expand(exclude)
        return [(kind, rx) for kind, rx in ALL_PATTERNS.items() if kind not in unwanted]
    return list(ALL_PATTERNS.items())
# ── Scanner ──────────────────────────────────────────────────────────────────
def scan(
    root: str,
    patterns: list[tuple[str, re.Pattern]],
    skip: set[str],
    exts: set[str],
    includes: list[str],
    externals: set[str],
    min_words: int,
) -> list[tuple]:
    """Walk ``root`` and collect every pattern match in the selected files.

    Directories named in ``skip`` and directories whose name starts with a
    dot are not descended into. A file is scanned when its extension is in
    ``exts`` and its path relative to ``root`` passes ``matches_include``.
    Files that cannot be read are reported on stderr and skipped.

    Args:
        root: Directory to walk.
        patterns: ``(kind, compiled_regex)`` pairs to search with.
        skip: Directory names to prune.
        exts: File extensions (with leading dot) to scan.
        includes: Glob patterns for relative paths; empty means all files.
        externals: Symbols to classify as ``"external"``.
        min_words: Matches with fewer words than this are dropped.

    Returns:
        Rows of ``(symbol, kind, relative_path, line, word_count, origin)``.
    """
    rows = []
    for dirpath, dirs, files in os.walk(root):
        # In-place assignment so os.walk does not descend into pruned dirs.
        dirs[:] = [d for d in dirs if d not in skip and not d.startswith(".")]
        for fname in files:
            if os.path.splitext(fname)[1] not in exts:
                continue
            fpath = os.path.join(dirpath, fname)
            rel = os.path.relpath(fpath, root)
            if not matches_include(rel, includes):
                continue
            try:
                # errors="replace" keeps undecodable bytes from aborting the scan.
                with open(fpath, encoding="utf-8", errors="replace") as f:
                    lines = f.readlines()
            except OSError as err:
                print(f"warning: skipping {rel}: {err}", file=sys.stderr)
                continue
            text = "".join(lines)
            idx = build_line_index(lines)
            for kind, pat in patterns:
                for m in pat.finditer(text):
                    sym = m.group()
                    wc = word_count(sym)
                    if wc < min_words:
                        continue
                    origin = "external" if sym in externals else "internal"
                    rows.append((sym, kind, rel, line_of(idx, m.start()), wc, origin))
    return rows
# ── Main ─────────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point for the symbol scanner.

    Parses the options, scans the tree under ``--root``, writes
    ``symbols-audit.csv`` (all rows) and ``symbols-internal.csv`` (internal
    rows without the origin column) into ``--out``, and prints summary
    counts. Exits with status 1 when no pattern is selected.
    """
    parser = argparse.ArgumentParser(description="Multi-word symbol scanner")
    parser.add_argument("--root", default=".", help="Repository root (default: cwd)")
    parser.add_argument("--out", default="/tmp", help="Output directory (default: /tmp)")
    parser.add_argument("--only", default=None,
                        help="Comma-separated pattern groups to include: camel,pascal,snake,screaming,kebab")
    parser.add_argument("--exclude", default=None,
                        help="Comma-separated pattern groups to exclude: camel,pascal,snake,screaming,kebab")
    parser.add_argument("--ext", default=None,
                        help="Comma-separated file extensions to scan (default: .ts,.js,.tsx,.jsx,.py,.go,.rs,.yaml,.yml)")
    parser.add_argument("--skip", default=None,
                        help="Comma-separated directory names to skip (default: .git,node_modules,vendor,dist,.cache)")
    parser.add_argument("--include", action="append", default=[],
                        help="Glob pattern for files to include (can be repeated; default: all files)")
    parser.add_argument("--externals", default=None,
                        help="Path to a file listing external symbols (one per line)")
    parser.add_argument("--min-words", type=int, default=2,
                        help="Minimum word count to report (default: 2)")
    args = parser.parse_args()

    root = os.path.abspath(args.root)
    patterns = resolve_patterns(args.only, args.exclude)
    if not patterns:
        print("error: no patterns selected", file=sys.stderr)
        sys.exit(1)

    if args.ext:
        exts = set()
        for raw in args.ext.split(","):
            # Strip before testing for the dot: " .yaml" used to become "..yaml".
            ext = raw.strip()
            if ext:
                exts.add(ext if ext.startswith(".") else f".{ext}")
    else:
        exts = DEFAULT_EXTS
    if args.skip:
        skip = {d.strip() for d in args.skip.split(",") if d.strip()}
    else:
        skip = DEFAULT_SKIP
    externals = load_externals(args.externals) if args.externals else set()

    print(f"scanning with: {', '.join(k for k, _ in patterns)}")
    if args.include:
        print(f"include: {args.include}")

    rows = scan(root, patterns, skip, exts, args.include, externals, args.min_words)
    rows.sort(key=lambda x: (x[1], x[0].lower()))

    # Create the output directory on demand instead of failing on open().
    if args.out:
        os.makedirs(args.out, exist_ok=True)

    # Full audit with origin
    audit_path = os.path.join(args.out, "symbols-audit.csv")
    with open(audit_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["symbol", "kind", "file", "line", "word_count", "origin"])
        w.writerows(rows)

    # Internal only, no origin column
    internal = [r for r in rows if r[5] == "internal"]
    internal_path = os.path.join(args.out, "symbols-internal.csv")
    with open(internal_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["symbol", "kind", "file", "line", "word_count"])
        w.writerows(r[:5] for r in internal)

    # Stats
    total = len(rows)
    by_origin = Counter(r[5] for r in rows)
    by_kind = Counter(r[1] for r in rows)
    unique_internal = len({(r[0], r[1]) for r in internal})
    print(f"\nscanned: {total} occurrences")
    print(f" external {by_origin.get('external', 0)}")
    print(f" internal {by_origin.get('internal', 0)}")
    print(f" unique internal symbols: {unique_internal}")
    print()
    print("by kind:")
    for k, v in sorted(by_kind.items()):
        print(f" {k:<20} {v}")
    print()
    print(f"wrote: {audit_path}")
    print(f"wrote: {internal_path}")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Symbol backlog manager. Works with the output of symbol-audit.py.
Storage format: TOML at the path given by --backlog (default: /tmp/symbols-backlog.toml).
Each symbol is a section keyed by symbol name with fields:
paths, kind, label, to, words
Commands:
init — create a fresh backlog from symbols-internal.csv
unlabelled — print all symbols not yet classified
label — apply a classification to symbols matching a pattern
labels — print built-in classification labels with descriptions
stats — print classification counts
export — export backlog as CSV or JSON
summary — print a markdown summary grouped by label
Usage:
python3 scripts/symbol-backlog.py init --from /tmp/symbols-internal.csv
python3 scripts/symbol-backlog.py unlabelled
python3 scripts/symbol-backlog.py label truncate --symbols 'authHeader,baseUrl' --to 'header,url'
python3 scripts/symbol-backlog.py label as-is --match 'snake_case:*'
python3 scripts/symbol-backlog.py labels
python3 scripts/symbol-backlog.py stats
python3 scripts/symbol-backlog.py export --label truncate --format csv
python3 scripts/symbol-backlog.py summary
"""
import argparse
import csv
import fnmatch
import json
import os
import re
import sys
from collections import Counter
# Backlog file used when --backlog is not given.
DEFAULT_BACKLOG = "/tmp/symbols-backlog.toml"

# ── Built-in labels ──────────────────────────────────────────────────────────
# Label name -> one-line description. Insertion order is the display order.
LABELS = dict(
    [
        ("subject.verb", "Property/action belongs on an existing entity; refactor to natural OOP"),
        ("state-machine", "Process/action should be a first-class object; Process.run() style"),
        ("truncate", "Symbol can be shortened without collision"),
        ("prefix-grouping", "Multiple prefix* locals → const prefix = { a, b, c }"),
        ("suffix-grouping", "Multiple *suffix locals → const suffix = { a, b }"),
        ("kv-ify", "Host/port or similar pairs → { host, port }"),
        ("snakify", "camelCase field should be snake_case to match convention"),
        ("kebabify", "Event names should use kebab-case"),
        ("json-yaml", "Config/log schema field; has a `to` target for standardizing"),
        ("api", "External API field name; do not rename"),
        ("as-is", "Exempt: function name, filename, comment text, or user decision"),
        ("needs-context", "Revisit later with more context"),
    ]
)
# ── Helpers ──────────────────────────────────────────────────────────────────
def split_words(sym):
    """Split an identifier into its lowercase component words.

    Leading ``#`` and ``_`` characters are dropped. A word is either an
    optional capital followed by lowercase letters/digits, or a run of
    capitals that is not followed by a lowercase letter or digit, so
    ``HTTPServer`` gives ``["http", "server"]`` and ``SCREAMING_SNAKE`` gives
    ``["screaming", "snake"]``.
    """
    clean = sym.lstrip("#_")
    # Negative lookahead lets a capital run end at `_` or `-`; the previous
    # "followed by a capital or end of string" rule cut the last letter off
    # ("SCREAMING_SNAKE" -> "screamin") and lost single-letter segments.
    parts = re.findall(r'[A-Z]?[a-z0-9]+|[A-Z]+(?![a-z0-9])', clean)
    if not parts:
        parts = clean.split("_")
    return [p.lower() for p in parts if p]
def _split_array_items(body):
    """Split the inside of a ``[...]`` value on commas that are outside quotes."""
    items, buf, quote = [], [], None
    for ch in body:
        if quote is not None:
            buf.append(ch)
            if ch == quote:
                quote = None
        elif ch in "\"'" and not "".join(buf).strip():
            # Only a quote at the start of an item opens a quoted string.
            quote = ch
            buf.append(ch)
        elif ch == ",":
            items.append("".join(buf))
            buf = []
        else:
            buf.append(ch)
    items.append("".join(buf))
    return [s.strip().strip('"').strip("'") for s in items if s.strip()]


def load(path):
    """Read the backlog file at ``path`` into a dict keyed by symbol.

    The file is parsed line by line: ``[name]`` starts a new entry with
    default fields (``paths``, ``kind``, ``label``, ``to``, ``words``), and
    ``key = value`` lines fill the current entry. Values wrapped in ``[...]``
    become lists of strings, values wrapped in double quotes become strings,
    and anything else is stored as the raw text. No escape sequences are
    interpreted. Lines before the first header are ignored.

    Prints an error and exits with status 1 when ``path`` does not exist.
    The file is opened with the platform default encoding, the same way
    ``save`` writes it.
    """
    if not os.path.exists(path):
        print(f"error: {path} not found. Run 'init' first.", file=sys.stderr)
        sys.exit(1)
    entries = {}
    current = None
    with open(path) as f:
        for raw in f:
            line = raw.rstrip("\n")
            m = re.match(r'^\[(.+)\]$', line)
            if m:
                current = m.group(1)
                entries[current] = {"paths": [], "kind": "", "label": "", "to": "", "words": []}
                continue
            if current is None:
                continue
            kv = re.match(r'^(\w+)\s*=\s*(.+)$', line)
            if not kv:
                continue
            key, val = kv.group(1), kv.group(2).strip()
            if val.startswith("[") and val.endswith("]"):
                # Quote-aware split: a path such as "a,b.ts" stays one item.
                entries[current][key] = _split_array_items(val[1:-1])
            elif val.startswith('"') and val.endswith('"'):
                entries[current][key] = val[1:-1]
            else:
                entries[current][key] = val
    return entries
def save(entries, path):
    """Write ``entries`` to ``path`` in the backlog text format.

    Entries are emitted sorted by ``(kind, lowercase symbol)``. Each one is a
    ``[symbol]`` header followed by ``paths``, ``kind``, ``label``, an
    optional ``to`` (only when non-empty) and ``words``, then a blank line.
    Values are wrapped in double quotes as-is; nothing is escaped.
    """
    def order(sym):
        return (entries[sym]["kind"], sym.lower())

    def quoted(values):
        return ", ".join(f'"{v}"' for v in values)

    with open(path, "w") as out:
        emit = out.write
        for sym in sorted(entries, key=order):
            entry = entries[sym]
            emit(f"[{sym}]\n")
            emit(f"paths = [{quoted(entry['paths'])}]\n")
            emit(f'kind = "{entry["kind"]}"\n')
            emit(f'label = "{entry["label"]}"\n')
            target = entry.get("to")
            if target:
                emit(f'to = "{entry["to"]}"\n')
            emit(f"words = [{quoted(entry['words'])}]\n")
            emit("\n")
# ── Commands ─────────────────────────────────────────────────────────────────
def cmd_init(args):
    """Create a fresh backlog from the CSV at ``args.source``.

    Reads the ``symbol``, ``kind`` and ``file`` columns, collects the distinct
    files per ``(symbol, kind)`` pair in first-seen order, and saves one
    unlabelled entry per symbol to ``args.backlog``. Prints an error and exits
    with status 1 when the source file does not exist.
    """
    if not os.path.exists(args.source):
        print(f"error: {args.source} not found", file=sys.stderr)
        sys.exit(1)
    seen = {}
    # Context manager closes the CSV; newline="" is what the csv module expects.
    with open(args.source, newline="") as f:
        for row in csv.DictReader(f):
            files = seen.setdefault((row["symbol"], row["kind"]), [])
            if row["file"] not in files:
                files.append(row["file"])
    entries = {}
    for (sym, kind), paths in seen.items():
        entries[sym] = {
            "paths": paths,
            "kind": kind,
            "label": "",
            "to": "",
            "words": split_words(sym),
        }
    save(entries, args.backlog)
    print(f"initialized {len(entries)} unique symbols in {args.backlog}")
def cmd_unlabelled(args):
    """Print every backlog entry whose label is empty, grouped by kind.

    Within each kind the symbols are listed case-insensitively sorted, each
    with the first path recorded for it. A trailing line gives the unlabelled
    and total counts.
    """
    entries = load(args.backlog)
    unlabelled = {s: e for s, e in entries.items() if not e["label"]}
    if not unlabelled:
        print("all symbols are labelled")
        return
    by_kind = {}
    for sym, e in unlabelled.items():
        by_kind.setdefault(e["kind"], []).append((sym, e))
    for kind in sorted(by_kind):
        items = by_kind[kind]
        print(f"\n{kind} ({len(items)}):")
        for sym, e in sorted(items, key=lambda x: x[0].lower()):
            # An entry may have no paths (e.g. a hand-edited backlog); indexing
            # [0] unconditionally raised IndexError for it.
            first_path = e["paths"][0] if e["paths"] else "(no path)"
            print(f" {sym:<40} {first_path}")
    print(f"\n{len(unlabelled)} unlabelled / {len(entries)} total")
def cmd_label(args):
    """Apply ``args.classification`` to the selected backlog entries and save.

    Selection is either ``--symbols`` (comma-separated names) or ``--match``
    (``kind_glob:symbol_glob``, or just ``symbol_glob``). With ``--symbols``,
    a ``--to`` value holding exactly one comma-separated target per symbol is
    paired with the symbols by position (``--symbols 'authHeader,baseUrl'
    --to 'header,url'``); in every other case the whole ``--to`` string is
    stored on each selected entry. Requested symbols missing from the backlog
    are reported on stderr. Exits with status 1 when neither selector is given.
    """
    entries = load(args.backlog)
    changed = 0

    def apply(sym, to):
        nonlocal changed
        entries[sym]["label"] = args.classification
        if to:
            entries[sym]["to"] = to
        changed += 1

    if args.symbols:
        # dict.fromkeys de-duplicates while keeping the order needed for pairing.
        names = list(dict.fromkeys(
            s.strip() for s in args.symbols.split(",") if s.strip()
        ))
        targets = [t.strip() for t in args.to.split(",")] if args.to else []
        paired = len(names) > 1 and len(targets) == len(names)
        missing = []
        for i, sym in enumerate(names):
            if sym not in entries:
                missing.append(sym)
                continue
            apply(sym, targets[i] if paired else args.to)
        if missing:
            print(f"warning: not in backlog: {', '.join(missing)}", file=sys.stderr)
    elif args.match:
        if ":" in args.match:
            kind_pat, sym_pat = args.match.split(":", 1)
        else:
            kind_pat, sym_pat = "*", args.match
        for sym, e in entries.items():
            if fnmatch.fnmatch(e["kind"], kind_pat) and fnmatch.fnmatch(sym, sym_pat):
                apply(sym, args.to)
    else:
        print("error: provide --symbols or --match", file=sys.stderr)
        sys.exit(1)
    save(entries, args.backlog)
    print(f"labelled {changed} symbols as '{args.classification}'")
def cmd_labels(_args):
    """Print each built-in label with its description, then a count line."""
    print("Built-in classification labels:\n")
    for name in LABELS:
        print(f" {name:<16} — {LABELS[name]}")
    count = len(LABELS)
    print(f"\n{count} labels. Any string is accepted; these are conventions.")
def cmd_stats(args):
    """Print the total entry count and a per-label tally, sorted by label.

    Entries with an empty label are counted under ``(unlabelled)``.
    """
    entries = load(args.backlog)
    tally = Counter()
    for entry in entries.values():
        tally[entry["label"] or "(unlabelled)"] += 1
    print(f"total: {len(entries)}")
    for label in sorted(tally):
        print(f" {label:<20} {tally[label]}")
def cmd_export(args):
    """Write the backlog to stdout as JSON or CSV.

    ``args.label``, when set, restricts output to entries with exactly that
    label. JSON output is a list of entry objects sorted by symbol; CSV output
    is sorted by ``(label, lowercase symbol)`` with paths joined by ``;`` and
    words joined by spaces.
    """
    entries = load(args.backlog)
    if args.label:
        selected = {s: e for s, e in entries.items() if e["label"] == args.label}
    else:
        selected = entries

    if args.format == "json":
        payload = [{"symbol": sym, **selected[sym]} for sym in sorted(selected)]
        print(json.dumps(payload, indent=2))
        return

    def order(sym):
        return (selected[sym]["label"], sym.lower())

    writer = csv.writer(sys.stdout)
    writer.writerow(["symbol", "kind", "label", "to", "paths", "words"])
    for sym in sorted(selected, key=order):
        entry = selected[sym]
        joined_paths = ";".join(entry["paths"])
        joined_words = " ".join(entry["words"])
        writer.writerow([sym, entry["kind"], entry["label"], entry.get("to", ""),
                         joined_paths, joined_words])
def cmd_summary(args):
    """Print a markdown table of label counts.

    Rows are ordered by descending count, then by label; the description
    column comes from LABELS and is blank for labels not defined there.
    """
    entries = load(args.backlog)
    tally = Counter()
    for entry in entries.values():
        tally[entry["label"] or "(unlabelled)"] += 1
    total = len(entries)
    print(f"## Symbol Backlog Summary ({total} symbols)\n")
    print("| Label | Count | Description |")
    print("|---|---|---|")
    ranked = sorted(tally, key=lambda name: (-tally[name], name))
    for label in ranked:
        count = tally[label]
        desc = LABELS.get(label, "")
        print(f"| `{label}` | {count} | {desc} |")
    print()
# ── CLI ──────────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point for the backlog manager.

    Builds the argument parser with one sub-command per ``cmd_*`` function,
    then runs the chosen one. Without a sub-command the help text is printed.
    """
    parser = argparse.ArgumentParser(description="Symbol backlog manager")
    parser.add_argument("--backlog", default=DEFAULT_BACKLOG,
                        help=f"Path to backlog TOML (default: {DEFAULT_BACKLOG})")
    commands = parser.add_subparsers(dest="command")

    init_cmd = commands.add_parser("init", help="Create fresh backlog from symbols-internal.csv")
    init_cmd.add_argument("--from", dest="source", default="/tmp/symbols-internal.csv")

    commands.add_parser("unlabelled", help="Print unlabelled symbols")
    commands.add_parser("labels", help="Print built-in classification labels")
    commands.add_parser("stats", help="Print classification counts")
    commands.add_parser("summary", help="Print markdown summary")

    label_cmd = commands.add_parser("label", help="Apply a classification")
    label_cmd.add_argument("classification", help="Label to apply")
    label_cmd.add_argument("--symbols", help="Comma-separated symbol names")
    label_cmd.add_argument("--match", help="Pattern: 'kind:symbol_glob' or 'symbol_glob'")
    label_cmd.add_argument("--to", help="Target name after refactoring (optional)")

    export_cmd = commands.add_parser("export", help="Export backlog as CSV or JSON")
    export_cmd.add_argument("--label", help="Filter to a specific label")
    export_cmd.add_argument("--format", choices=["csv", "json"], default="csv")

    args = parser.parse_args()

    # Sub-command name -> handler; a missing sub-command falls through to help.
    handlers = {
        "init": cmd_init,
        "unlabelled": cmd_unlabelled,
        "label": cmd_label,
        "labels": cmd_labels,
        "stats": cmd_stats,
        "export": cmd_export,
        "summary": cmd_summary,
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
    else:
        handler(args)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment