#!/usr/bin/env python3
"""Merge per-agent jsonl status into downloads.yaml.

Reads docs/research/inspiration/.status/agent-*.jsonl, builds a
``{url: {status, local_path, notes}}`` map, then rewrites
docs/research/downloads.yaml in place, updating the ``status`` and
``local_path`` fields of every item whose url has a recorded result.
All other content is preserved verbatim.

The YAML file is patched line by line (it is never parsed), so an item is
recognised purely by its ``- url: <value>`` line, and the item's block ends at
the next blank line or the next ``- url:`` line.
"""

import json
import re
import sys
from pathlib import Path

try:
    ROOT = Path(__file__).resolve().parents[4]  # claude_skill_owl/
except IndexError:
    # The file sits shallower than its expected position inside the repo
    # (e.g. it was copied elsewhere). Resolve the docs/ tree against the
    # working directory instead of failing at import time.
    ROOT = Path.cwd()
STATUS_DIR = ROOT / "docs" / "research" / "inspiration" / ".status"
YAML_PATH = ROOT / "docs" / "research" / "downloads.yaml"

# Agents emitted arxiv pdf urls (with or without the ".pdf" suffix) but the
# yaml only lists abs urls.
_ARXIV_PDF_RE = re.compile(r"^https://arxiv\.org/pdf/(.+?)(?:\.pdf)?$")
# RFC txt mirror urls don't appear in the yaml; it lists datatracker html urls.
_RFC_TXT_RE = re.compile(r"^https://www\.rfc-editor\.org/rfc/rfc(\d+)\.txt$")

# A complete item header, capturing the url token.
_ITEM_RE = re.compile(r"^\s*- url:\s+(\S+)\s*$")
# Anything that starts a new item, even if its url token is malformed.
_ITEM_START_RE = re.compile(r"^\s*- url:")
# A field this script owns; group 1 is the indent to reuse when rewriting.
_FIELD_RE = re.compile(r"^(\s+)(status|local_path):")


def canonicalize(url: str) -> str:
    """Map companion-download urls onto the url form used in downloads.yaml.

    arxiv ``/pdf/<id>[.pdf]`` becomes ``/abs/<id>`` and rfc-editor ``.txt``
    mirrors become datatracker html urls. Any other url is returned unchanged.
    """
    m = _ARXIV_PDF_RE.match(url)
    if m:
        return f"https://arxiv.org/abs/{m.group(1)}"
    m = _RFC_TXT_RE.match(url)
    if m:
        return f"https://datatracker.ietf.org/doc/html/rfc{m.group(1)}"
    return url


def merge_record(results: dict[str, dict], rec: dict) -> None:
    """Fold one jsonl record into ``results`` (mutated in place).

    ``rec`` must carry ``url`` and ``status``; ``local_path`` and ``notes``
    are optional. Records whose urls canonicalize to the same key are treated
    as companion fetches (PDF + HTML, txt + html) of one item:

    * the item counts as "downloaded" if at least one companion succeeded;
    * local paths of successful companions are joined with ``;`` (a path that
      is already listed is not repeated);
    * distinct notes are joined with ``"; "``.

    Raises:
        KeyError: if ``rec`` lacks ``url`` or ``status``.
    """
    canon = canonicalize(rec["url"])
    status = rec["status"]
    local_path = rec.get("local_path") or ""
    notes = rec.get("notes") or ""

    existing = results.get(canon)
    if existing is None:
        results[canon] = {"status": status, "local_path": local_path, "notes": notes}
        return

    if status == "downloaded":
        # Only successful companions contribute a path.
        if local_path and local_path not in existing["local_path"].split(";"):
            existing["local_path"] = (
                f"{existing['local_path']};{local_path}"
                if existing["local_path"]
                else local_path
            )
        existing["status"] = "downloaded"

    if notes and notes not in existing["notes"].split("; "):
        existing["notes"] = f"{existing['notes']}; {notes}" if existing["notes"] else notes


def load_results(status_dir: Path) -> dict[str, dict]:
    """Read every ``agent-*.jsonl`` in ``status_dir`` into a merged result map.

    Files are processed in sorted name order and blank lines are skipped.

    Raises:
        json.JSONDecodeError: for a malformed line; the message is prefixed
            with ``<file>:<line number>`` so the offending record can be found.
    """
    results: dict[str, dict] = {}
    for jsonl in sorted(status_dir.glob("agent-*.jsonl")):
        text = jsonl.read_text(encoding="utf-8")
        for lineno, raw in enumerate(text.splitlines(), start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{jsonl}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
            merge_record(results, rec)
    return results


def _patch_field(line: str, rec: dict) -> str:
    """Return ``line`` with its status/local_path value replaced from ``rec``.

    Lines that are not a ``status:`` or ``local_path:`` field are returned
    unchanged. The line's own indentation is reused.
    """
    m = _FIELD_RE.match(line)
    if not m:
        return line
    indent, key = m.groups()
    if key == "status":
        return f"{indent}status: {rec['status']}"
    if rec["status"] == "downloaded" and rec["local_path"]:
        # A JSON string is a valid YAML double-quoted scalar, so this escapes
        # backslashes and quotes that plain "..." wrapping would corrupt.
        quoted = json.dumps(rec["local_path"], ensure_ascii=False)
        return f"{indent}local_path: {quoted}"
    return f"{indent}local_path: null"


def rewrite_yaml(
    yaml_text: str, results: dict[str, dict]
) -> tuple[str, int, list[str]]:
    """Patch status/local_path of every yaml item that has a merged result.

    Args:
        yaml_text: full contents of downloads.yaml.
        results: map produced by ``load_results`` / ``merge_record``.

    Returns:
        ``(new_text, matched, unmatched_urls)`` where ``matched`` counts yaml
        items found in ``results`` and ``unmatched_urls`` lists, in file
        order, the yaml urls that had no result.

    Item and field detection is indentation-tolerant, and surrounding quotes
    on the yaml url are ignored for the lookup.
    """
    lines = yaml_text.split("\n")
    out_lines: list[str] = []
    unmatched: list[str] = []
    matched = 0
    i = 0
    while i < len(lines):
        line = lines[i]
        out_lines.append(line)
        i += 1

        m = _ITEM_RE.match(line)
        if not m:
            continue
        url = m.group(1).strip("'\"")
        rec = results.get(url)

        # Copy the item's remaining lines up to a blank line or the next
        # item, patching the fields we own. The terminating line itself is
        # left for the outer loop to handle.
        while i < len(lines):
            sub = lines[i]
            if _ITEM_START_RE.match(sub) or not sub.strip():
                break
            out_lines.append(_patch_field(sub, rec) if rec else sub)
            i += 1

        if rec:
            matched += 1
        else:
            unmatched.append(url)
    return "\n".join(out_lines), matched, unmatched


def main(status_dir: Path = STATUS_DIR, yaml_path: Path = YAML_PATH) -> int:
    """Merge the agent status files into the yaml file and print a summary."""
    results = load_results(status_dir)
    yaml_text = yaml_path.read_text(encoding="utf-8")
    new_text, matched, unmatched_yaml_urls = rewrite_yaml(yaml_text, results)
    yaml_path.write_text(new_text, encoding="utf-8")

    downloaded = sum(1 for v in results.values() if v["status"] == "downloaded")
    failed = len(results) - downloaded  # every non-"downloaded" status
    print(f"yaml urls matched: {matched}")
    print(f"yaml urls unmatched: {len(unmatched_yaml_urls)}")
    for u in unmatched_yaml_urls:
        print(f" - {u}")
    print(f"jsonl unique urls (after canonicalize): {len(results)}")
    print(f" downloaded: {downloaded}")
    print(f" failed: {failed}")
    return 0


if __name__ == "__main__":
    sys.exit(main())