feat(scripts): backfill full_text for unrated drafts with rev fallback
The fetch pipeline left the 128 new drafts (2026-05 delta) without full text — the per-source text download skipped them. This script downloads text from the IETF archive using the recorded revision, probing 00-09 and the bare name as fallback. Run before 'analyze' so rating uses full text.
This commit is contained in:
95
scripts/backfill-unrated-text.py
Normal file
95
scripts/backfill-unrated-text.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Backfill full_text for unrated drafts that are missing it.
|
||||
|
||||
The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source
|
||||
text download can skip them). Analysis quality depends on full text, so this script
|
||||
downloads it directly from the IETF archive using the draft's recorded revision, with
|
||||
a fallback that probes revisions 00-09 and then the unversioned name if the recorded
revision cannot be fetched.
|
||||
|
||||
Usage:
|
||||
PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N]
|
||||
|
||||
--all Backfill ALL drafts missing full_text (not just unrated ones).
|
||||
--limit Cap the number of drafts processed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Put the relative "src" directory first on the import path so the
# ietf_analyzer import below can resolve. The path is relative, so it is
# interpreted against the current working directory.
# NOTE(review): presumably the script is meant to be run from the repo root — confirm.
sys.path.insert(0, "src")
|
||||
|
||||
import httpx
|
||||
|
||||
from ietf_analyzer.config import Config
|
||||
|
||||
# Base URL that candidate_urls() prefixes to "/<name>-<rev>.txt" and
# "/<name>.txt" when building the download URLs for a draft.
TEXT_BASE = "https://www.ietf.org/archive/id"
|
||||
|
||||
|
||||
def candidate_urls(name: str, rev: str) -> list[str]:
    """Build the ordered list of archive URLs to probe for a draft's text.

    The recorded revision (when non-empty) comes first, followed by the
    revisions ``00`` through ``09`` (skipping the recorded one so it is not
    requested twice), and finally the unversioned ``<name>.txt``.

    Args:
        name: Draft name without revision suffix.
        rev: Recorded revision string; may be empty.

    Returns:
        URLs in the order they should be tried.
    """
    revisions = [rev] if rev else []
    # Fixed probe range 00..09; the recorded revision is already at the front.
    revisions += [probe for probe in map("{:02d}".format, range(10)) if probe != rev]
    versioned = [f"{TEXT_BASE}/{name}-{candidate}.txt" for candidate in revisions]
    return [*versioned, f"{TEXT_BASE}/{name}.txt"]
|
||||
|
||||
|
||||
def main() -> int:
    """Download missing ``full_text`` for drafts and store it in the database.

    Selects rows from ``drafts`` with ``source = 'ietf'`` whose ``full_text``
    is NULL or empty (restricted to drafts that have no row in ``ratings``
    unless ``--all`` is given), tries each URL from ``candidate_urls`` in
    order, and writes the first usable response back to ``drafts.full_text``.

    Command-line options:
        --all    Process every draft missing text, not only unrated ones.
        --limit  Process at most N drafts; zero or a negative value means
                 no cap.

    Returns:
        Always 0. Per-draft failures are reported on stdout and in the final
        summary line, not through the exit status.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated")
    ap.add_argument("--limit", type=int, default=0)
    args = ap.parse_args()

    cfg = Config.load()
    conn = sqlite3.connect(cfg.db_path)
    conn.row_factory = sqlite3.Row

    # Fixed SQL fragment selected by a flag; no user-supplied text is
    # interpolated into the query.
    where_unrated = "" if args.all else (
        "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)"
    )
    sql = f"""
        SELECT name, rev FROM drafts d
        WHERE (full_text IS NULL OR full_text = '')
          AND source = 'ietf'
          {where_unrated}
        ORDER BY name
    """

    ok = fail = 0
    try:
        rows = conn.execute(sql).fetchall()
        # Only a positive limit caps the run; a negative value would otherwise
        # slice rows off the *end* of the list, which is never what is meant.
        if args.limit > 0:
            rows = rows[: args.limit]

        print(f"Backfilling text for {len(rows)} drafts...")
        # The context manager closes the client's connection pool even if the
        # loop is interrupted.
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            for i, row in enumerate(rows, 1):
                name, rev = row["name"], row["rev"] or "00"
                text = None
                for url in candidate_urls(name, rev):
                    try:
                        resp = client.get(url)
                    except httpx.HTTPError:
                        # Best effort: a transport/protocol error on one
                        # candidate just moves on to the next one.
                        continue
                    # Require a 200 and more than 200 characters before
                    # accepting the body as the draft text.
                    if resp.status_code == 200 and len(resp.text) > 200:
                        text = resp.text[:500_000]  # cap 500K
                        break
                if text:
                    conn.execute("UPDATE drafts SET full_text = ? WHERE name = ?", (text, name))
                    # Commit per draft so an interrupted run keeps its progress.
                    conn.commit()
                    ok += 1
                    print(f"  [{i}/{len(rows)}] OK   {name} ({len(text)} chars)")
                else:
                    fail += 1
                    print(f"  [{i}/{len(rows)}] FAIL {name}")
                # Pause between drafts to avoid hammering the archive server.
                time.sleep(0.3)
    finally:
        # Close the database connection on every exit path, including errors.
        conn.close()

    print(f"\nDone. {ok} downloaded, {fail} failed.")
    return 0
|
||||
|
||||
|
||||
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user