#!/usr/bin/env python3 """Backfill full_text for unrated drafts that are missing it. The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source text download can skip them). Analysis quality depends on full text, so this script downloads it directly from the IETF archive using the draft's recorded revision, with a fallback that probes nearby revisions if the recorded one 404s. Usage: PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N] --all Backfill ALL drafts missing full_text (not just unrated ones). --limit Cap the number of drafts processed. """ from __future__ import annotations import argparse import sqlite3 import sys import time sys.path.insert(0, "src") import httpx from ietf_analyzer.config import Config TEXT_BASE = "https://www.ietf.org/archive/id" def candidate_urls(name: str, rev: str) -> list[str]: """URLs to try, in order: recorded rev, then 00..09, then bare name.""" urls = [f"{TEXT_BASE}/{name}-{rev}.txt"] if rev else [] for r in (f"{i:02d}" for i in range(10)): if r != rev: urls.append(f"{TEXT_BASE}/{name}-{r}.txt") urls.append(f"{TEXT_BASE}/{name}.txt") return urls def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated") ap.add_argument("--limit", type=int, default=0) args = ap.parse_args() cfg = Config.load() conn = sqlite3.connect(cfg.db_path) conn.row_factory = sqlite3.Row where_unrated = "" if args.all else ( "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)" ) sql = f""" SELECT name, rev FROM drafts d WHERE (full_text IS NULL OR full_text = '') AND source = 'ietf' {where_unrated} ORDER BY name """ rows = conn.execute(sql).fetchall() if args.limit: rows = rows[: args.limit] print(f"Backfilling text for {len(rows)} drafts...") client = httpx.Client(timeout=30, follow_redirects=True) ok = fail = 0 for i, row in enumerate(rows, 1): name, rev = row["name"], row["rev"] or "00" text = None for url in candidate_urls(name, rev): try: resp = client.get(url) if resp.status_code == 200 and len(resp.text) > 200: text = resp.text[:500_000] # cap 500K break except httpx.HTTPError: continue if text: conn.execute("UPDATE drafts SET full_text = ? WHERE name = ?", (text, name)) conn.commit() ok += 1 print(f" [{i}/{len(rows)}] OK {name} ({len(text)} chars)") else: fail += 1 print(f" [{i}/{len(rows)}] FAIL {name}") time.sleep(0.3) print(f"\nDone. {ok} downloaded, {fail} failed.") conn.close() return 0 if __name__ == "__main__": raise SystemExit(main())