#!/usr/bin/env python3
"""Extract a compact, markdown-like outline from a reveal.js HTML deck.

This is intentionally lossy. It strips scripts, styles, SVGs, vendor markup,
and most HTML structure while preserving rough slide boundaries and visible
text. The output is meant for fast agent inspection, not publishing.
"""

from __future__ import annotations

import argparse
import html
import re
import sys
from html.parser import HTMLParser
from pathlib import Path


# Elements whose whole subtree is left out of the outline.
SKIP_TAGS = {
    "canvas",
    "head",
    "math",
    "metadata",
    "noscript",
    "script",
    "style",
    "svg",
    "template",
}

# HTML void elements: they have no content, so they never open a skipped subtree.
VOID_TAGS = {
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}

# Elements whose start and end tags force a line break in the output.
BLOCK_TAGS = {
    "address",
    "article",
    "aside",
    "blockquote",
    "br",
    "dd",
    "div",
    "dl",
    "dt",
    "figcaption",
    "figure",
    "footer",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "header",
    "hr",
    "li",
    "main",
    "nav",
    "ol",
    "p",
    "pre",
    "table",
    "td",
    "th",
    "tr",
    "ul",
}


class RevealTextParser(HTMLParser):
    """Accumulate the visible text of a reveal.js deck as rough markdown.

    Output fragments are appended to ``parts``; the caller joins them and
    tidies the whitespace. Every ``<section>`` emits a ``--- slide ---``
    heading whose level follows the section nesting depth (capped at 6).
    Subtrees rooted at a ``SKIP_TAGS`` element or at an element carrying the
    ``notes`` class are dropped, images become ``[image: label]`` and
    ``<video>``/``<audio>``/``<iframe>`` become ``[tag: label]`` placeholders.
    """

    def __init__(self) -> None:
        # With convert_charrefs=False, entities arrive through
        # handle_entityref/handle_charref and are decoded in handle_data.
        super().__init__(convert_charrefs=False)
        self.parts: list[str] = []
        # Nesting count of the element being skipped; 0 means "not skipping".
        self.skip_depth = 0
        self.section_depth = 0
        # Name of the element that opened the current skipped subtree.
        self._skip_tag = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit the outline markup for an opening tag, or start skipping."""
        tag = tag.lower()

        if self.skip_depth:
            # Only tags named like the skipped element change the depth.
            # Counting every tag breaks on markup that is not balanced:
            # self-closing void tags (<br/>, <source/>) and elements with
            # optional end tags (<li>, <p>) inside the skipped subtree.
            if tag == self._skip_tag:
                self.skip_depth += 1
            return

        attrs_dict = {name.lower(): value or "" for name, value in attrs}
        classes = set(attrs_dict.get("class", "").split())

        if tag in SKIP_TAGS or "notes" in classes:
            # A void element has no subtree, so there is nothing to skip.
            if tag not in VOID_TAGS:
                self._start_skip(tag)
            return

        if tag == "section":
            self.section_depth += 1
            level = "#" * min(self.section_depth, 6)
            self._newline(force=True)
            self.parts.append(f"\n\n{level} --- slide ---\n\n")
            return

        if tag == "img":
            label = (
                attrs_dict.get("alt")
                or attrs_dict.get("title")
                or self._source_name(attrs_dict)
            )
            if label:
                self.parts.append(f" [image: {label}] ")
            return

        if tag in {"video", "audio", "iframe"}:
            label = attrs_dict.get("title") or self._source_name(attrs_dict) or tag
            self.parts.append(f" [{tag}: {label}] ")
            # Fallback content and <source>/<track> children are not slide text.
            self._start_skip(tag)
            return

        if tag == "li":
            self._newline()
            self.parts.append("- ")
        elif tag in BLOCK_TAGS:
            self._newline()

    def handle_endtag(self, tag: str) -> None:
        """Close a skipped subtree, a slide, or a block-level element."""
        tag = tag.lower()
        if self.skip_depth:
            # Mirror handle_starttag: only the skipped element's own end tag
            # counts, so stray or self-closing inner tags cannot end the skip.
            if tag == self._skip_tag:
                self.skip_depth -= 1
            return
        if tag == "section" and self.section_depth:
            self.section_depth -= 1
            self._newline()
        elif tag in BLOCK_TAGS:
            self._newline()

    def handle_data(self, data: str) -> None:
        """Append visible text with each whitespace run collapsed to a space."""
        if self.skip_depth:
            return
        text = re.sub(r"\s+", " ", html.unescape(data))
        if text.strip():
            self.parts.append(text)
        elif text and self.parts and not self.parts[-1][-1:].isspace():
            # Whitespace-only node (including &nbsp;) between inline content:
            # keep one separating space so neighbouring words do not fuse.
            # Nothing is added after a newline or an existing space.
            self.parts.append(" ")

    def handle_entityref(self, name: str) -> None:
        """Decode a named entity such as ``&amp;`` through handle_data."""
        self.handle_data(f"&{name};")

    def handle_charref(self, name: str) -> None:
        """Decode a numeric character reference through handle_data."""
        self.handle_data(f"&#{name};")

    @staticmethod
    def _source_name(attrs_dict: dict[str, str]) -> str:
        """Return the file name from ``src``, or from ``data-src`` if unset.

        reveal.js lazy loading moves the URL from ``src`` to ``data-src``.
        """
        source = attrs_dict.get("src") or attrs_dict.get("data-src", "")
        return Path(source).name

    def _start_skip(self, tag: str) -> None:
        """Begin dropping everything up to the matching end tag of *tag*."""
        self._skip_tag = tag
        self.skip_depth = 1

    def _newline(self, force: bool = False) -> None:
        """Append a line break unless the output already ends with one.

        With ``force=True`` the line break is appended unconditionally.
        """
        if force or (self.parts and not self.parts[-1].endswith("\n")):
            self.parts.append("\n")


def simplify_html(text: str) -> str:
    """Parse *text* with RevealTextParser and return the tidied outline.

    The result has no leading or trailing blank lines, never contains two
    blank lines in a row, and always ends with exactly one newline.
    """
    extractor = RevealTextParser()
    extractor.feed(text)
    extractor.close()

    draft = "".join(extractor.parts)
    # Applied in order: drop trailing blanks, cap newline runs at two,
    # then squeeze runs of spaces/tabs into a single space.
    for pattern, replacement in (
        (r"[ \t]+\n", "\n"),
        (r"\n{3,}", "\n\n"),
        (r"[ \t]{2,}", " "),
    ):
        draft = re.sub(pattern, replacement, draft)

    kept: list[str] = []
    for raw_line in draft.splitlines():
        line = raw_line.strip()
        # A blank line is kept only when the previously kept line has text
        # (or when nothing has been kept yet).
        if line or not kept or kept[-1]:
            kept.append(line)

    return "\n".join(kept).strip() + "\n"


def main() -> int:
    """Run the command-line interface.

    Reads the HTML file named on the command line, prints its outline to
    stdout and truncates it when it exceeds ``--max-chars``.

    Returns:
        0 on success, 1 when the input file cannot be read (the error is
        reported on stderr).
    """
    parser = argparse.ArgumentParser(
        description="Print a compact markdown-like outline from a reveal.js HTML file."
    )
    parser.add_argument("html_file", type=Path)
    parser.add_argument(
        "--max-chars",
        type=int,
        default=60000,
        help="truncate output after this many characters; use 0 for no truncation",
    )
    args = parser.parse_args()

    try:
        # "utf-8-sig" decodes plain UTF-8 as well, but drops a leading
        # byte-order mark that would otherwise show up as stray text.
        source = args.html_file.read_text(encoding="utf-8-sig", errors="replace")
    except OSError as exc:
        print(f"reveal_text.py: {exc}", file=sys.stderr)
        return 1

    output = simplify_html(source)
    limit = args.max_chars
    # Only a positive limit truncates. A negative value used to slice
    # characters off the END of the output; it now means "no limit", like 0.
    if limit > 0 and len(output) > limit:
        output = output[:limit].rstrip() + "\n\n[truncated]\n"
    print(output, end="")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())