#!/usr/bin/env python3
# Origin (taken from the git format-patch envelope this script arrived in):
#   commit:  3e8e52ffe39688600e5e1f6c3273a0fb5ee02c51
#   From:    Christian Nennemann
#   Date:    Fri, 22 May 2026 11:56:39 +0200
#   Subject: [PATCH] feat(scripts): backfill full_text for unrated drafts with rev fallback
#   Path:    scripts/backfill-unrated-text.py (new file, mode 100644)
#
#   The fetch pipeline left the 128 new drafts (2026-05 delta) without full
#   text — the per-source text download skipped them. This script downloads
#   text from the IETF archive using the recorded revision, probing 00-09 and
#   the bare name as fallback. Run before 'analyze' so rating uses full text.
"""Backfill full_text for unrated drafts that are missing it.

The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source
text download can skip them). Analysis quality depends on full text, so this script
downloads it directly from the IETF archive using the draft's recorded revision, with
a fallback that probes revisions 00-09 and then the unversioned name when the
recorded revision cannot be fetched.

Usage:
    PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N]

    --all     Backfill ALL drafts missing full_text (not just unrated ones).
    --limit   Cap the number of drafts processed.
"""
from __future__ import annotations

import argparse
import contextlib
import sqlite3
import sys
import time

# "src" is a relative path, so this only helps when the script is started from
# a directory that contains src/ (the documented usage sets PYTHONPATH instead).
sys.path.insert(0, "src")

TEXT_BASE = "https://www.ietf.org/archive/id"

# A 200 response with a body this short or shorter is not accepted as draft text.
_MIN_TEXT_CHARS = 200
# Stored text is truncated to this many characters.
_MAX_TEXT_CHARS = 500_000
# Pause after each draft (not after each request).
_PAUSE_SECONDS = 0.3


def candidate_urls(name: str, rev: str) -> list[str]:
    """Return the archive URLs to try for a draft, in priority order.

    Order: the recorded revision (when *rev* is non-empty), then revisions
    ``00`` through ``09`` (skipping *rev* so it is never listed twice), and
    finally the unversioned ``<name>.txt``.

    NOTE(review): the fallback walks 00..09 in ascending order, so when the
    recorded revision is unavailable the lowest-numbered revision that exists
    is the one that gets picked. Confirm that this is the intended preference.

    Args:
        name: Draft name without revision suffix, e.g. ``draft-foo-bar``.
        rev: Recorded revision such as ``"03"``; may be empty.

    Returns:
        A list of absolute URLs under ``TEXT_BASE``.
    """
    urls = [f"{TEXT_BASE}/{name}-{rev}.txt"] if rev else []
    urls.extend(
        f"{TEXT_BASE}/{name}-{probe}.txt"
        for probe in (f"{i:02d}" for i in range(10))
        if probe != rev
    )
    urls.append(f"{TEXT_BASE}/{name}.txt")
    return urls


def _fetch_text(client, urls, transport_error):
    """Return the text of the first URL in *urls* that yields a usable body.

    A response is usable when its status is 200 and its decoded body is longer
    than ``_MIN_TEXT_CHARS``; the result is truncated to ``_MAX_TEXT_CHARS``.

    NOTE(review): an exception of type *transport_error* on one URL simply
    moves on to the next candidate, so a transient failure on the recorded
    revision can end with a different revision's text being returned.

    Args:
        client: Object with a ``get(url)`` method returning a response that
            exposes ``status_code`` and ``text`` (an ``httpx.Client`` here).
        urls: Candidate URLs in priority order.
        transport_error: Exception type (or tuple of types) treated as
            "this URL failed, try the next one".

    Returns:
        The (possibly truncated) text, or ``None`` if no candidate worked.
    """
    for url in urls:
        try:
            resp = client.get(url)
        except transport_error:
            continue
        if resp.status_code == 200 and len(resp.text) > _MIN_TEXT_CHARS:
            return resp.text[:_MAX_TEXT_CHARS]
    return None


def main() -> int:
    """Download and store full text for drafts that lack it.

    Selects rows from ``drafts`` where ``full_text`` is NULL or empty and
    ``source = 'ietf'`` (restricted to drafts with no row in ``ratings``
    unless ``--all`` is given), tries each candidate URL, and writes the first
    usable text back to the row.

    Returns:
        Always ``0``; individual download failures are reported on stdout and
        counted in the final summary but do not change the exit status.
    """
    ap = argparse.ArgumentParser(
        description="Backfill full_text for drafts that are missing it."
    )
    ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated")
    ap.add_argument(
        "--limit", type=int, default=0,
        help="cap the number of drafts processed (0 = no cap)",
    )
    args = ap.parse_args()

    # Imported after argument parsing so that --help, and importing this module
    # for candidate_urls(), work without httpx or the project package installed.
    import httpx

    from ietf_analyzer.config import Config

    # The interpolated fragment is one of two fixed strings, never user input.
    where_unrated = "" if args.all else (
        "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)"
    )
    sql = f"""
        SELECT name, rev FROM drafts d
        WHERE (full_text IS NULL OR full_text = '')
          AND source = 'ietf'
          {where_unrated}
        ORDER BY name
    """

    cfg = Config.load()
    ok = fail = 0
    # closing() is required: sqlite3's own context manager only commits or
    # rolls back, it does not close the connection.
    with contextlib.closing(sqlite3.connect(cfg.db_path)) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(sql).fetchall()
        # Only a positive limit caps the run; a negative value would otherwise
        # slice rows off the end of the list.
        if args.limit > 0:
            rows = rows[: args.limit]
        total = len(rows)

        print(f"Backfilling text for {total} drafts...")
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            for i, row in enumerate(rows, 1):
                name = row["name"]
                rev = row["rev"] or "00"
                text = _fetch_text(client, candidate_urls(name, rev), httpx.HTTPError)
                if text is None:
                    fail += 1
                    print(f" [{i}/{total}] FAIL {name}")
                else:
                    conn.execute(
                        "UPDATE drafts SET full_text = ? WHERE name = ?", (text, name)
                    )
                    # Commit per draft so an interrupted run keeps what it has
                    # already downloaded.
                    conn.commit()
                    ok += 1
                    print(f" [{i}/{total}] OK {name} ({len(text)} chars)")
                time.sleep(_PAUSE_SECONDS)

    print(f"\nDone. {ok} downloaded, {fail} failed.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())