#!/usr/bin/env python3 """Extract a compact, markdown-like outline from a reveal.js HTML deck. This is intentionally lossy. It strips scripts, styles, SVGs, vendor markup, and most HTML structure while preserving rough slide boundaries and visible text. The output is meant for fast agent inspection, not publishing. """ from __future__ import annotations import argparse import html import re import sys from html.parser import HTMLParser from pathlib import Path SKIP_TAGS = { "canvas", "head", "math", "metadata", "noscript", "script", "style", "svg", "template", } VOID_TAGS = { "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr", } BLOCK_TAGS = { "address", "article", "aside", "blockquote", "br", "dd", "div", "dl", "dt", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "ol", "p", "pre", "table", "td", "th", "tr", "ul", } class RevealTextParser(HTMLParser): def __init__(self) -> None: super().__init__(convert_charrefs=False) self.parts: list[str] = [] self.skip_depth = 0 self.section_depth = 0 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: tag = tag.lower() attrs_dict = {name.lower(): value or "" for name, value in attrs} classes = set(attrs_dict.get("class", "").split()) if self.skip_depth or tag in SKIP_TAGS or "notes" in classes: if tag not in VOID_TAGS: self.skip_depth += 1 return if tag == "section": self.section_depth += 1 level = "#" * min(self.section_depth, 6) self._newline(force=True) self.parts.append(f"\n\n{level} --- slide ---\n\n") return if tag == "img": label = ( attrs_dict.get("alt") or attrs_dict.get("title") or Path(attrs_dict.get("src", "")).name ) if label: self.parts.append(f" [image: {label}] ") return if tag in {"video", "audio", "iframe"}: label = attrs_dict.get("title") or Path(attrs_dict.get("src", "")).name or tag self.parts.append(f" [{tag}: {label}] ") self.skip_depth += 1 return if tag == "li": self._newline() self.parts.append("- ") elif tag in BLOCK_TAGS: self._newline() def handle_endtag(self, tag: str) -> None: tag = tag.lower() if self.skip_depth: self.skip_depth -= 1 return if tag == "section" and self.section_depth: self.section_depth -= 1 self._newline() elif tag in BLOCK_TAGS: self._newline() def handle_data(self, data: str) -> None: if self.skip_depth: return text = html.unescape(data) text = re.sub(r"\s+", " ", text) if text.strip(): self.parts.append(text) def handle_entityref(self, name: str) -> None: self.handle_data(f"&{name};") def handle_charref(self, name: str) -> None: self.handle_data(f"&#{name};") def _newline(self, force: bool = False) -> None: if force or (self.parts and not self.parts[-1].endswith("\n")): self.parts.append("\n") def simplify_html(text: str) -> str: parser = RevealTextParser() parser.feed(text) parser.close() rough = "".join(parser.parts) rough = re.sub(r"[ \t]+\n", "\n", rough) rough = re.sub(r"\n{3,}", "\n\n", rough) rough = re.sub(r"[ \t]{2,}", " ", rough) lines = [line.strip() for line in rough.splitlines()] cleaned: list[str] = [] previous_blank = False for line in lines: if not line: if not previous_blank: cleaned.append("") previous_blank = True continue previous_blank = False cleaned.append(line) return "\n".join(cleaned).strip() + "\n" def main() -> int: parser = argparse.ArgumentParser( description="Print a compact markdown-like outline from a reveal.js HTML file." ) parser.add_argument("html_file", type=Path) parser.add_argument( "--max-chars", type=int, default=60000, help="truncate output after this many characters; use 0 for no truncation", ) args = parser.parse_args() try: source = args.html_file.read_text(encoding="utf-8", errors="replace") except OSError as exc: print(f"reveal_text.py: {exc}", file=sys.stderr) return 1 output = simplify_html(source) if args.max_chars and len(output) > args.max_chars: output = output[: args.max_chars].rstrip() + "\n\n[truncated]\n" print(output, end="") return 0 if __name__ == "__main__": raise SystemExit(main())