#!/usr/bin/env python3
"""Merge per-agent jsonl status into downloads.yaml.

Reads docs/research/inspiration/.status/agent-*.jsonl, builds {url: (status, local_path)}
map, then rewrites docs/research/downloads.yaml in-place updating status + local_path
fields for each matching url block. Preserves all other content verbatim.
"""
import json
import re
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[4]  # claude_skill_owl/
STATUS_DIR = ROOT / "docs" / "research" / "inspiration" / ".status"
YAML_PATH = ROOT / "docs" / "research" / "downloads.yaml"

# Companion-download urls that agents report but that downloads.yaml does not list.
# arXiv serves PDFs both as /pdf/<id>.pdf and as the extension-less /pdf/<id>,
# so the ".pdf" suffix is optional.
_ARXIV_PDF_RE = re.compile(r"^https://arxiv\.org/pdf/(.+?)(?:\.pdf)?$")
_RFC_TXT_RE = re.compile(r"^https://www\.rfc-editor\.org/rfc/rfc(\d+)\.txt$")


def canonicalize(url: str) -> str:
    """Map a companion-download url onto the url form used in downloads.yaml.

    * ``https://arxiv.org/pdf/<id>[.pdf]`` -> ``https://arxiv.org/abs/<id>``
      (agent-2 emitted arxiv pdf urls but the yaml only has abs urls).
    * ``https://www.rfc-editor.org/rfc/rfc<N>.txt`` ->
      ``https://datatracker.ietf.org/doc/html/rfc<N>`` (RFC txt mirror urls
      don't appear in the yaml).

    Any other url is returned unchanged. The arXiv id is copied as-is, so a
    version suffix such as ``v2`` is preserved.
    """
    m = _ARXIV_PDF_RE.match(url)
    if m:
        return f"https://arxiv.org/abs/{m.group(1)}"
    m = _RFC_TXT_RE.match(url)
    if m:
        return f"https://datatracker.ietf.org/doc/html/rfc{m.group(1)}"
    return url

# Build url -> {status, local_path, notes} from every agent's jsonl log.
# Several records can canonicalize to the same url (companion fetches: PDF next
# to the abs page for arxiv, txt next to the html page for RFC). Those are folded
# into a single entry: local paths are joined with ";", notes with "; ".
results: dict[str, dict] = {}
for jsonl in sorted(STATUS_DIR.glob("agent-*.jsonl")):
    for line in jsonl.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)
        canon = canonicalize(rec["url"])
        local_path = rec.get("local_path") or ""
        notes = rec.get("notes") or ""
        existing = results.get(canon)
        if existing is None:
            results[canon] = {
                "status": rec["status"],
                "local_path": local_path,
                "notes": notes,
            }
            continue

        # Companion (or repeated) record for a url that was already seen.
        if rec["status"] == "downloaded" and local_path:
            # Skip a path that is already recorded so that a record appearing
            # twice (agent re-run, overlapping logs) does not yield "a;a".
            if local_path not in existing["local_path"].split(";"):
                if existing["local_path"]:
                    existing["local_path"] = f"{existing['local_path']};{local_path}"
                else:
                    existing["local_path"] = local_path
        # Keep the notes of every contributing record instead of only the first.
        if notes and notes != existing["notes"]:
            if existing["notes"]:
                existing["notes"] = f"{existing['notes']}; {notes}"
            else:
                existing["notes"] = notes
        # One successful fetch is enough to call the url "downloaded", no matter
        # which of the companion records failed.
        if existing["status"] != "downloaded" and rec["status"] == "downloaded":
            existing["status"] = "downloaded"

# Rewrite downloads.yaml line by line (deliberately without a yaml parser) so
# that everything except the patched `status:` / `local_path:` lines is copied
# through unchanged.
#
# An item starts at a `  - url: <url>` line and ends at the next blank line or
# the next `  - url:` line; only lines inside an item whose url has a merged
# jsonl record are patched.
yaml_text = YAML_PATH.read_text(encoding="utf-8")
lines = yaml_text.split("\n")

item_start_re = re.compile(r"^  - url: (\S+)\s*$")

out_lines: list[str] = []
matched = 0
unmatched_yaml_urls: list[str] = []
rec = None  # merged record of the item currently being copied, None outside one

for line in lines:
    m = item_start_re.match(line)
    if m:
        url = m.group(1)
        rec = results.get(url)
        if rec is None:
            # `results` is keyed by canonical url; retry with the canonical form
            # in case the yaml lists a pdf/txt companion url directly.
            rec = results.get(canonicalize(url))
        if rec is None:
            unmatched_yaml_urls.append(url)
        else:
            matched += 1
        out_lines.append(line)
        continue

    if line.startswith("  - url:") or line.strip() == "":
        # Blank line or a malformed item header: the current item is over and
        # whatever follows is copied verbatim until the next valid item.
        rec = None

    if rec is not None:
        if line.startswith("    status:"):
            out_lines.append(f"    status: {rec['status']}")
            continue
        if line.startswith("    local_path:"):
            if rec["status"] == "downloaded" and rec["local_path"]:
                # A JSON string literal is a valid YAML double-quoted scalar, so
                # json.dumps takes care of escaping backslashes and quotes.
                quoted = json.dumps(rec["local_path"], ensure_ascii=False)
                out_lines.append(f"    local_path: {quoted}")
            else:
                out_lines.append("    local_path: null")
            continue

    out_lines.append(line)

# split("\n") / join("\n") round-trips the trailing newline of the file as-is.
YAML_PATH.write_text("\n".join(out_lines), encoding="utf-8")

# Summary report on stdout: how many yaml items were patched, which yaml urls
# had no jsonl record, and the status breakdown of the merged jsonl records.
# Note that "failed" counts every status other than "downloaded".
statuses = [entry["status"] for entry in results.values()]
downloaded = statuses.count("downloaded")
failed = len(statuses) - downloaded

report = [
    f"yaml urls matched: {matched}",
    f"yaml urls unmatched: {len(unmatched_yaml_urls)}",
    *(f"  - {missing}" for missing in unmatched_yaml_urls),
    f"jsonl unique urls (after canonicalize): {len(results)}",
    f"  downloaded: {downloaded}",
    f"  failed:     {failed}",
]
print("\n".join(report))
