feat(scripts): backfill full_text for unrated drafts with rev fallback
Some checks failed
CI / test (3.11) (push) Failing after 9s
CI / test (3.12) (push) Failing after 9s

The fetch pipeline left the 128 new drafts (2026-05 delta) without full
text — the per-source text download skipped them. This script downloads
text from the IETF archive using the recorded revision, probing 00-09 and
the bare name as fallback. Run before 'analyze' so rating uses full text.
This commit is contained in:
2026-05-22 11:56:39 +02:00
parent d9b1243a96
commit 3e8e52ffe3

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""Backfill full_text for unrated drafts that are missing it.
The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source
text download can skip them). Analysis quality depends on full text, so this script
downloads it directly from the IETF archive using the draft's recorded revision, with
a fallback that probes revisions 00-09 and then the unversioned name if that fails.
Usage:
PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N]
--all Backfill ALL drafts missing full_text (not just unrated ones).
--limit Cap the number of drafts processed.
"""
from __future__ import annotations
import argparse
import sqlite3
import sys
import time
sys.path.insert(0, "src")
import httpx
from ietf_analyzer.config import Config
TEXT_BASE = "https://www.ietf.org/archive/id"


def candidate_urls(name: str, rev: str) -> list[str]:
    """Build the ordered list of archive URLs to try for one draft.

    The recorded revision (when non-empty) comes first, followed by the
    two-digit revisions 00 through 09 (skipping the recorded one so it is not
    listed twice), and finally the unversioned ``<name>.txt``.
    """
    # Collect the revision suffixes first, then format them all in one pass.
    revisions = [rev] if rev else []
    revisions.extend(
        probe for probe in (f"{n:02d}" for n in range(10)) if probe != rev
    )
    versioned = [f"{TEXT_BASE}/{name}-{revision}.txt" for revision in revisions]
    return [*versioned, f"{TEXT_BASE}/{name}.txt"]
_MIN_TEXT_CHARS = 200  # a 200 response must be longer than this to be accepted
_MAX_TEXT_CHARS = 500_000  # stored text is truncated to this many characters


def _fetch_text(client: httpx.Client, name: str, rev: str) -> str | None:
    """Return the first usable draft text among the candidate URLs, or None.

    A response is usable when its status is 200 and its text is longer than
    ``_MIN_TEXT_CHARS``; the returned text is truncated to ``_MAX_TEXT_CHARS``.
    An ``httpx.HTTPError`` on one URL is skipped so the next candidate is tried.
    """
    for url in candidate_urls(name, rev):
        try:
            resp = client.get(url)
        except httpx.HTTPError:
            # Best-effort probing: a failed request just means "try the next URL".
            continue
        if resp.status_code == 200 and len(resp.text) > _MIN_TEXT_CHARS:
            return resp.text[:_MAX_TEXT_CHARS]
    return None


def main() -> int:
    """Backfill ``drafts.full_text`` from the IETF archive for rows missing it.

    Selects ``source = 'ietf'`` drafts whose ``full_text`` is NULL or empty
    (restricted to drafts with no ``ratings`` row unless ``--all`` is given),
    tries to download each one, and commits every successful update on its own.

    Returns:
        Always 0; per-draft failures are reported on stdout and in the final
        summary rather than through the exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated")
    ap.add_argument("--limit", type=int, default=0)
    args = ap.parse_args()

    cfg = Config.load()
    conn = sqlite3.connect(cfg.db_path)
    try:
        conn.row_factory = sqlite3.Row
        # The interpolated fragment is one of two fixed strings chosen by a
        # flag, never user-supplied text.
        where_unrated = "" if args.all else (
            "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)"
        )
        sql = f"""
            SELECT name, rev FROM drafts d
            WHERE (full_text IS NULL OR full_text = '')
              AND source = 'ietf'
              {where_unrated}
            ORDER BY name
        """
        rows = conn.execute(sql).fetchall()
        # Only a positive limit caps the run. A negative value used to be
        # truthy and sliced from the end, silently dropping the last rows.
        if args.limit > 0:
            rows = rows[: args.limit]
        total = len(rows)
        print(f"Backfilling text for {total} drafts...")

        ok = fail = 0
        # The context manager closes the HTTP client even if a request or a
        # database write raises part-way through the run.
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            for i, row in enumerate(rows, 1):
                name = row["name"]
                # A missing/empty recorded revision falls back to "00".
                text = _fetch_text(client, name, row["rev"] or "00")
                if text:
                    conn.execute("UPDATE drafts SET full_text = ? WHERE name = ?", (text, name))
                    # Commit per draft so completed downloads are persisted
                    # before the next one starts.
                    conn.commit()
                    ok += 1
                    print(f" [{i}/{total}] OK {name} ({len(text)} chars)")
                else:
                    fail += 1
                    print(f" [{i}/{total}] FAIL {name}")
                # Brief pause between drafts (presumably to go easy on the
                # archive server).
                time.sleep(0.3)
        print(f"\nDone. {ok} downloaded, {fail} failed.")
    finally:
        # Close the database connection on every exit path, including errors.
        conn.close()
    return 0
if __name__ == "__main__":
    # Run the backfill and pass main()'s return value to the interpreter as
    # the process exit status.
    sys.exit(main())