From 3e8e52ffe39688600e5e1f6c3273a0fb5ee02c51 Mon Sep 17 00:00:00 2001
From: Christian Nennemann <christian@nennemann.de>
Date: Fri, 22 May 2026 11:56:39 +0200
Subject: [PATCH] feat(scripts): backfill full_text for unrated drafts with rev
 fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fetch pipeline left the 128 new drafts (2026-05 delta) without full
text — the per-source text download skipped them. This script downloads
text from the IETF archive using the recorded revision, probing 00-09 and
the bare name as fallback. Run before 'analyze' so rating uses full text.
---
 scripts/backfill-unrated-text.py | 95 ++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 scripts/backfill-unrated-text.py

diff --git a/scripts/backfill-unrated-text.py b/scripts/backfill-unrated-text.py
new file mode 100644
index 0000000..dbf1731
--- /dev/null
+++ b/scripts/backfill-unrated-text.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Backfill full_text for unrated drafts that are missing it.
+
+The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source
+text download can skip them). Analysis quality depends on full text, so this script
+downloads it directly from the IETF archive using the draft's recorded revision, falling
+back to revisions 00-09 and then the unversioned name if the recorded one is unavailable.
+
+Usage:
+    PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N]
+
+    --all    Backfill ALL drafts missing full_text (not just unrated ones).
+    --limit  Cap the number of drafts processed.
+"""
+from __future__ import annotations
+
+import argparse
+import sqlite3
+import sys
+import time
+
+sys.path.insert(0, "src")
+
+import httpx
+
+from ietf_analyzer.config import Config
+
+TEXT_BASE = "https://www.ietf.org/archive/id"
+
+
+def candidate_urls(name: str, rev: str) -> list[str]:
+    """Return archive URLs to probe: recorded rev first, then 00-09, then the bare name."""
+    stem = f"{TEXT_BASE}/{name}"
+    # An empty rev contributes no entry; the recorded rev is never listed twice.
+    revisions = ([rev] if rev else []) + [
+        padded for padded in (f"{n:02d}" for n in range(10)) if padded != rev
+    ]
+    return [f"{stem}-{r}.txt" for r in revisions] + [f"{stem}.txt"]
+
+
+def main() -> int:
+    """Download and store missing full_text; returns exit status 0 even if some drafts fail."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated")
+    ap.add_argument("--limit", type=int, default=0)
+    args = ap.parse_args()
+    # --all drops the "no row in ratings" filter; the fragment is a fixed literal, not user input.
+    where_unrated = "" if args.all else (
+        "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)"
+    )
+    sql = f"""
+        SELECT name, rev FROM drafts d
+        WHERE (full_text IS NULL OR full_text = '')
+          AND source = 'ietf'
+          {where_unrated}
+        ORDER BY name
+    """
+    conn = sqlite3.connect(Config.load().db_path)
+    conn.row_factory = sqlite3.Row
+    ok = fail = 0
+    # try/finally + the client context manager release the DB handle and HTTP pool on any exit.
+    try:
+        rows = conn.execute(sql).fetchall()
+        if args.limit > 0:  # ignore negatives, which would otherwise slice rows off the tail
+            rows = rows[: args.limit]
+        print(f"Backfilling text for {len(rows)} drafts...")
+        with httpx.Client(timeout=30, follow_redirects=True) as client:
+            for i, row in enumerate(rows, 1):
+                name, rev = row["name"], row["rev"] or "00"
+                text = None
+                for url in candidate_urls(name, rev):
+                    try:
+                        resp = client.get(url)
+                        if resp.status_code == 200 and len(resp.text) > 200:
+                            text = resp.text[:500_000]  # cap 500K
+                            break
+                    except httpx.HTTPError:
+                        continue  # transport failure: fall through to the next candidate URL
+                if text:
+                    conn.execute("UPDATE drafts SET full_text = ? WHERE name = ?", (text, name))
+                    conn.commit()  # per-draft commit keeps progress if the run is interrupted
+                    ok += 1
+                    print(f"  [{i}/{len(rows)}] OK   {name} ({len(text)} chars)")
+                else:
+                    fail += 1
+                    print(f"  [{i}/{len(rows)}] FAIL {name}")
+                time.sleep(0.3)
+    finally:
+        conn.close()
+    print(f"\nDone. {ok} downloaded, {fail} failed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status