feat(scripts): backfill full_text for unrated drafts with rev fallback
The fetch pipeline left the 128 new drafts (2026-05 delta) without full text — the per-source text download skipped them. This script downloads text from the IETF archive using the recorded revision, probing 00-09 and the bare name as fallback. Run before 'analyze' so rating uses full text.
This commit is contained in:
95
scripts/backfill-unrated-text.py
Normal file
95
scripts/backfill-unrated-text.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Backfill full_text for unrated drafts that are missing it.
|
||||||
|
|
||||||
|
The fetch pipeline sometimes leaves new IETF drafts without full text (the per-source
|
||||||
|
text download can skip them). Analysis quality depends on full text, so this script
|
||||||
|
downloads it directly from the IETF archive using the draft's recorded revision, with
|
||||||
|
a fallback that probes revisions 00-09 and then the unversioned name if the recorded one is unavailable.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
PYTHONPATH=src python3 scripts/backfill-unrated-text.py [--all] [--limit N]
|
||||||
|
|
||||||
|
--all Backfill ALL drafts missing full_text (not just unrated ones).
|
||||||
|
--limit Cap the number of drafts processed.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
sys.path.insert(0, "src")
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from ietf_analyzer.config import Config
|
||||||
|
|
||||||
|
TEXT_BASE = "https://www.ietf.org/archive/id"


def candidate_urls(name: str, rev: str) -> list[str]:
    """Build the ordered list of archive URLs to probe for a draft's text.

    The recorded revision (when non-empty) comes first, followed by the
    two-digit revisions 00 through 09 (skipping the recorded one so it is
    not requested twice), and finally the unversioned ``<name>.txt``.

    Args:
        name: Draft name used as the file stem in the archive URL.
        rev: Recorded revision string; a falsy value means "no recorded rev".

    Returns:
        URLs in the order they should be tried.
    """
    # Collect revision suffixes first, then format them all in one pass.
    suffixes = [rev] if rev else []
    for number in range(10):
        padded = f"{number:02d}"
        if padded != rev:
            suffixes.append(padded)

    ordered = [f"{TEXT_BASE}/{name}-{suffix}.txt" for suffix in suffixes]
    ordered.append(f"{TEXT_BASE}/{name}.txt")
    return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Download missing full_text for IETF drafts and store it in the database.

    Selects rows from ``drafts`` with ``source = 'ietf'`` whose ``full_text``
    is NULL or empty (restricted to drafts with no ``ratings`` row unless
    ``--all`` is given). For each draft, the URLs from ``candidate_urls`` are
    tried in order and the first 200 response with more than 200 characters
    is written back, truncated to 500,000 characters.

    Returns:
        Process exit code. Always 0, even when some drafts could not be
        fetched; per-draft failures are reported on stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--all", action="store_true", help="all drafts missing text, not just unrated")
    ap.add_argument(
        "--limit",
        type=int,
        default=0,
        help="cap the number of drafts processed (0 or negative = no cap)",
    )
    args = ap.parse_args()

    cfg = Config.load()
    conn = sqlite3.connect(cfg.db_path)
    # try/finally so the connection is released even if a query or download
    # raises; sqlite3's own context manager only scopes a transaction.
    try:
        conn.row_factory = sqlite3.Row

        # The interpolated fragment is a fixed literal chosen by a flag, never
        # user-supplied text, so building the statement with an f-string is safe.
        where_unrated = "" if args.all else (
            "AND NOT EXISTS (SELECT 1 FROM ratings r WHERE r.draft_name = d.name)"
        )
        sql = f"""
            SELECT name, rev FROM drafts d
            WHERE (full_text IS NULL OR full_text = '')
              AND source = 'ietf'
              {where_unrated}
            ORDER BY name
        """
        rows = conn.execute(sql).fetchall()
        # Only a positive limit caps the run; a negative value would otherwise
        # slice rows off the END of the list (rows[:-N]).
        if args.limit > 0:
            rows = rows[: args.limit]

        print(f"Backfilling text for {len(rows)} drafts...")
        ok = fail = 0
        # Context manager closes the client's connection pool on every exit path.
        with httpx.Client(timeout=30, follow_redirects=True) as client:
            for i, row in enumerate(rows, 1):
                name, rev = row["name"], row["rev"] or "00"
                text = None
                for url in candidate_urls(name, rev):
                    try:
                        resp = client.get(url)
                    except httpx.HTTPError:
                        # Best effort: a transport error on one candidate just
                        # moves on to the next URL.
                        continue
                    # Very short bodies are treated as unusable responses.
                    if resp.status_code == 200 and len(resp.text) > 200:
                        text = resp.text[:500_000]  # cap 500K
                        break
                if text:
                    conn.execute("UPDATE drafts SET full_text = ? WHERE name = ?", (text, name))
                    # Commit per draft so an interrupted run keeps its progress.
                    conn.commit()
                    ok += 1
                    print(f"  [{i}/{len(rows)}] OK   {name} ({len(text)} chars)")
                else:
                    fail += 1
                    print(f"  [{i}/{len(rows)}] FAIL {name}")
                # Brief pause between drafts to avoid hammering the archive.
                time.sleep(0.3)

        print(f"\nDone. {ok} downloaded, {fail} failed.")
    finally:
        conn.close()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
Reference in New Issue
Block a user