Replace knit with generate_toc.py (#6279)

2026-03-10 09:05:20 +01:00
parent 28c9bed632
commit d848ccc148
11 changed files with 244 additions and 59 deletions
--- a/tools/docs/generate_toc.py
+++ b/tools/docs/generate_toc.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""Generate or verify markdown TOCs between HTML markers."""
+
+from __future__ import annotations
+
+import argparse
+import difflib
+import glob
+import re
+import sys
+from pathlib import Path
+
+TOC_MARKER = "<!--- TOC -->"
+END_MARKER = "<!--- END -->"
+HEADER_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$")
+FENCE_RE = re.compile(r"^\s*(`{3,}|~{3,})")
+
+
+def parse_headers(content: str) -> list[tuple[int, str]]:
+    """Extract markdown headers after the first END marker, excluding fenced code blocks."""
+    end_index = content.find(END_MARKER)
+    if end_index == -1:
+        return []
+
+    scan_region = content[end_index + len(END_MARKER) :]
+    headers: list[tuple[int, str]] = []
+    in_fence = False
+    fence_char = ""
+    fence_len = 0
+
+    for line in scan_region.splitlines():
+        fence_match = FENCE_RE.match(line)
+        if fence_match:
+            fence = fence_match.group(1)
+            if not in_fence:
+                in_fence = True
+                fence_char = fence[0]
+                fence_len = len(fence)
+            elif fence[0] == fence_char and len(fence) >= fence_len:
+                in_fence = False
+            continue
+
+        if in_fence:
+            continue
+
+        match = HEADER_RE.match(line)
+        if not match:
+            continue
+
+        level = len(match.group(1))
+        text = re.sub(r"\s+#+\s*$", "", match.group(2)).strip()
+        if text:
+            headers.append((level, text))
+
+    return headers
+
+
+def _slugify(text: str) -> str:
+    """Generate a markdown anchor similar to GitHub style."""
+    anchor = text.lower().strip()
+    anchor = re.sub(r"[^\w\s-]", "", anchor)
+    anchor = re.sub(r"\s+", "-", anchor)
+    anchor = re.sub(r"-+", "-", anchor).strip("-")
+    return anchor
+
+
+def generate_toc(headers: list[tuple[int, str]]) -> str:
+    """Generate markdown TOC content from parsed headers."""
+    if not headers:
+        return ""
+
+    min_level = min(level for level, _ in headers)
+    slug_counts: dict[str, int] = {}
+    toc_lines: list[str] = []
+
+    for level, text in headers:
+        base_slug = _slugify(text)
+        count = slug_counts.get(base_slug, 0)
+        slug_counts[base_slug] = count + 1
+        slug = base_slug if count == 0 else f"{base_slug}-{count}"
+
+        indent = "  " * (level - min_level)
+        toc_lines.append(f"{indent}* [{text}](#{slug})")
+
+    return "\n".join(toc_lines)
+
+
+def replace_toc_section(content: str, new_toc: str) -> str:
+    """Replace TOC block content between TOC and END markers."""
+    toc_start = content.find(TOC_MARKER)
+    if toc_start == -1:
+        raise ValueError("TOC marker not found")
+
+    toc_end = content.find(END_MARKER, toc_start + len(TOC_MARKER))
+    if toc_end == -1:
+        raise ValueError("END marker not found after TOC marker")
+
+    replacement = f"{TOC_MARKER}\n\n{new_toc}\n\n{END_MARKER}"
+    return content[:toc_start] + replacement + content[toc_end + len(END_MARKER) :]
+
+
+def build_expected_content(content: str) -> str:
+    """Build the expected markdown content after TOC regeneration."""
+    headers = parse_headers(content)
+    toc = generate_toc(headers)
+    return replace_toc_section(content, toc)
+
+
+def resolve_markdown_files(inputs: list[str]) -> list[Path]:
+    """Resolve CLI arguments to a de-duplicated ordered list of markdown files."""
+    files: list[Path] = []
+    seen: set[Path] = set()
+
+    def add_path(candidate: Path) -> None:
+        resolved = candidate.resolve()
+        if resolved.suffix.lower() != ".md" or not resolved.is_file() or resolved in seen:
+            return
+        seen.add(resolved)
+        files.append(resolved)
+
+    for item in inputs:
+        path = Path(item)
+        if path.exists():
+            if path.is_file():
+                add_path(path)
+            elif path.is_dir():
+                for md_file in sorted(path.rglob("*.md")):
+                    add_path(md_file)
+            continue
+
+        for matched in sorted(glob.glob(item, recursive=True)):
+            add_path(Path(matched))
+
+    return files
+
+
+def verify_file(path: Path) -> bool:
+    """Verify whether a file already contains the expected TOC."""
+    content = path.read_text(encoding="utf-8")
+
+    if TOC_MARKER not in content or END_MARKER not in content:
+        print(f"SKIP    | {path} (missing TOC markers)")
+        return True
+
+    expected = build_expected_content(content)
+    if expected == content:
+        print(f"OK      | {path}")
+        return True
+
+    print(f"OUTDATED| {path}", file=sys.stderr)
+    diff = difflib.unified_diff(
+        content.splitlines(),
+        expected.splitlines(),
+        fromfile=f"{path} (current)",
+        tofile=f"{path} (expected)",
+        lineterm="",
+    )
+    for line in diff:
+        print(line, file=sys.stderr)
+    return False
+
+
+def update_file(path: Path) -> bool:
+    """Regenerate and write TOC in a markdown file."""
+    content = path.read_text(encoding="utf-8")
+
+    if TOC_MARKER not in content or END_MARKER not in content:
+        print(f"SKIP    | {path} (missing TOC markers)")
+        return True
+
+    updated = build_expected_content(content)
+    if updated == content:
+        print(f"UNCHANGED| {path}")
+        return True
+
+    path.write_text(updated, encoding="utf-8")
+    print(f"UPDATED | {path}")
+    return True
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Generate markdown TOCs between <!--- TOC --> and <!--- END --> markers."
+    )
+    parser.add_argument("markdown_files", nargs="+", help="Markdown files, directories, or glob patterns")
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Check files without modifying them; returns non-zero when TOC is outdated.",
+    )
+    args = parser.parse_args()
+
+    files = resolve_markdown_files(args.markdown_files)
+    if not files:
+        print("No markdown files were resolved from input arguments.", file=sys.stderr)
+        return 1
+
+    results = [verify_file(path) if args.verify else update_file(path) for path in files]
+    return 0 if all(results) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+