"""Analysis, visualization, and complex computation data access functions.""" from __future__ import annotations import json import re from collections import Counter, defaultdict from typing import TypedDict import numpy as np from sklearn.cluster import AgglomerativeClustering from sklearn.manifold import TSNE from sklearn.preprocessing import normalize as sk_normalize from ietf_analyzer.config import Config from ietf_analyzer.db import Database SAFETY_CATEGORIES = {"AI safety/alignment", "Agent identity/auth", "Policy/governance"} CAPABILITY_CATEGORIES = {"A2A protocols", "Agent discovery/reg", "Autonomous netops", "Data formats/interop", "Human-agent interaction", "Model serving/inference"} from webui.data._shared import _cached, _extract_month from webui.data.drafts import get_draft_detail _ARCH_LAYERS = [ {"id": "transport", "label": "Transport & Networking", "order": 0, "keywords": {"transport", "network", "routing", "tunnel", "packet", "flow", "traffic", "qos", "sdwan", "mpls", "bgp", "ospf", "segment", "srv6", "quic", "http", "grpc", "mqtt", "yang", "snmp", "netconf", "restconf"}}, {"id": "identity", "label": "Identity & Trust", "order": 1, "keywords": {"identity", "auth", "authentication", "authorization", "credential", "certificate", "trust", "attestation", "oauth", "token", "signing", "verification", "verifiable", "did", "vc", "pki", "spiffe", "acl"}}, {"id": "discovery", "label": "Discovery & Registration", "order": 2, "keywords": {"discovery", "registration", "registry", "catalog", "advertisement", "announce", "capability", "service", "lookup", "resolution", "dns", "directory"}}, {"id": "communication", "label": "Agent Communication", "order": 3, "keywords": {"a2a", "agent", "communication", "message", "messaging", "protocol", "exchange", "negotiation", "handshake", "session", "dialogue", "interaction", "mcp", "interop"}}, {"id": "coordination", "label": "Task & Coordination", "order": 4, "keywords": {"task", "delegation", "orchestration", "workflow", "planning", "coordination", "consensus", "collaboration", "multi-agent", "swarm", "composition", "scheduling"}}, {"id": "intelligence", "label": "AI & Inference", "order": 5, "keywords": {"model", "inference", "learning", "training", "ml", "neural", "llm", "embedding", "reasoning", "decision", "prediction", "classification", "generative", "rag", "fine-tuning"}}, {"id": "safety", "label": "Safety & Governance", "order": 6, "keywords": {"safety", "ethical", "governance", "policy", "audit", "explainability", "transparency", "accountability", "bias", "fairness", "compliance", "regulation", "risk", "shutdown", "alignment", "adversarial", "privacy", "consent"}}, {"id": "application", "label": "Application Domains", "order": 7, "keywords": {"healthcare", "autonomous", "vehicle", "robotics", "iot", "digital twin", "supply chain", "finance", "manufacturing", "energy", "smart", "edge", "cloud", "sensing"}}, ] _LAYER_KEYWORDS = {l["id"]: l["keywords"] for l in _ARCH_LAYERS} class TimelineData(TypedDict): """Monthly category counts from :func:`get_timeline_data`.""" months: list[str] series: dict[str, list[int]] categories: list[str] class SimilarityGraphStats(TypedDict): """Stats sub-dict in similarity graph.""" node_count: int edge_count: int avg_similarity: float class SimilarityGraph(TypedDict): """Draft similarity network from :func:`get_similarity_graph`.""" nodes: list[dict] edges: list[dict] stats: SimilarityGraphStats class CitationGraphStats(TypedDict): """Stats sub-dict in citation graph.""" node_count: int edge_count: int rfc_count: int draft_count: int class CitationGraph(TypedDict): """Citation network from :func:`get_citation_graph`.""" nodes: list[dict] edges: list[dict] stats: CitationGraphStats class MonitorCost(TypedDict): """Cost sub-dict in monitor status.""" input_tokens: int output_tokens: int estimated_usd: float class MonitorPipeline(TypedDict): """Pipeline sub-dict in monitor status.""" total_drafts: int rated: int embedded: int with_ideas: int idea_total: int gap_count: int class MonitorStatus(TypedDict): """Monitor status from :func:`get_monitor_status`.""" last_run: dict | None runs: list[dict] unprocessed: dict[str, int] total_runs: int pipeline: MonitorPipeline cost: MonitorCost def get_ideas_by_type(db: Database) -> dict: """Return ideas grouped by type with counts.""" all_ideas = db.all_ideas() type_counts = Counter(i.get("type", "other") or "other" for i in all_ideas) return { "total": len(all_ideas), "by_type": dict(type_counts.most_common()), "ideas": all_ideas, } def get_idea_detail(db: Database, idea_id: int) -> dict | None: """Return a single idea with source draft info and similar ideas.""" row = db.conn.execute("SELECT * FROM ideas WHERE id = ?", (idea_id,)).fetchone() if not row: return None idea = { "id": row["id"], "title": row["title"], "description": row["description"], "type": row["idea_type"], "draft_name": row["draft_name"], "novelty_score": row["novelty_score"], } # Get source draft info draft = db.get_draft(row["draft_name"]) if draft: idea["draft_title"] = draft.title idea["draft_date"] = draft.date # Get category from ratings rated = db.drafts_with_ratings(limit=2000) for d, r in rated: if d.name == row["draft_name"]: idea["categories"] = r.categories break # Find similar ideas using embeddings similar = [] emb_row = db.conn.execute( "SELECT vector FROM idea_embeddings WHERE idea_id = ?", (idea_id,) ).fetchone() if emb_row: target_vec = np.frombuffer(emb_row["vector"], dtype=np.float32) all_embs = db.all_idea_embeddings() # Compute cosine similarities scores = [] for other_id, other_vec in all_embs.items(): if other_id == idea_id: continue cos_sim = float(np.dot(target_vec, other_vec) / ( np.linalg.norm(target_vec) * np.linalg.norm(other_vec) + 1e-9)) scores.append((other_id, cos_sim)) scores.sort(key=lambda x: x[1], reverse=True) top_5 = scores[:5] # Fetch idea details for top 5 if top_5: ids = [s[0] for s in top_5] sim_map = {s[0]: s[1] for s in top_5} placeholders = ",".join("?" * len(ids)) sim_rows = db.conn.execute( f"SELECT id, title, idea_type, draft_name FROM ideas WHERE id IN ({placeholders})", ids, ).fetchall() sim_dict = {r["id"]: r for r in sim_rows} for sid, score in top_5: sr = sim_dict.get(sid) if sr: similar.append({ "id": sr["id"], "title": sr["title"], "type": sr["idea_type"], "draft_name": sr["draft_name"], "similarity": round(score, 3), }) idea["similar"] = similar return idea def get_timeline_data(db: Database) -> TimelineData: """Return monthly counts by category for timeline chart.""" pairs = db.drafts_with_ratings(limit=1000) all_drafts = db.list_drafts(limit=1000, order_by="time ASC") rating_map = {d.name: r for d, r in pairs} month_cat: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) for d in all_drafts: month = _extract_month(d.time) r = rating_map.get(d.name) if r: cat = r.categories[0] if r.categories else "Other" month_cat[month][cat] += 1 months = sorted(month_cat.keys()) cat_totals: Counter = Counter() for mc in month_cat.values(): for c, cnt in mc.items(): cat_totals[c] += cnt top_cats = [c for c, _ in cat_totals.most_common(10)] series = {} for cat in top_cats: series[cat] = [month_cat[m].get(cat, 0) for m in months] return {"months": months, "series": series, "categories": top_cats} def get_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: """Return draft similarity network (cached).""" return _cached(f"similarity_{threshold}", lambda: _compute_similarity_graph(db, threshold)) def _compute_similarity_graph(db: Database, threshold: float = 0.75) -> SimilarityGraph: """Return draft similarity network for force-directed graph. Returns {nodes: [{name, title, category, score}], edges: [{source, target, similarity}], stats: {node_count, edge_count, avg_similarity}} """ embeddings = db.all_embeddings() if len(embeddings) < 2: return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} pairs = db.drafts_with_ratings(limit=1000) rating_map = {d.name: r for d, r in pairs} draft_map = {d.name: d for d, _ in pairs} # Filter to drafts with both embeddings and ratings names = [n for n in embeddings if n in rating_map] if len(names) < 2: return {"nodes": [], "edges": [], "stats": {"node_count": 0, "edge_count": 0, "avg_similarity": 0}} matrix = np.array([embeddings[n] for n in names]) # L2-normalize and compute cosine similarity norms = np.linalg.norm(matrix, axis=1, keepdims=True) norms[norms == 0] = 1.0 normalized = matrix / norms sim_matrix = normalized @ normalized.T # Find pairs above threshold (upper triangle only) edges = [] node_set = set() for i in range(len(names)): for j in range(i + 1, len(names)): sim = float(sim_matrix[i, j]) if sim >= threshold: edges.append({"source": names[i], "target": names[j], "similarity": round(sim, 4)}) node_set.add(names[i]) node_set.add(names[j]) # Build nodes from connected drafts only nodes = [] for name in names: if name not in node_set: continue r = rating_map[name] d = draft_map.get(name) nodes.append({ "name": name, "title": d.title if d else name, "category": r.categories[0] if r.categories else "Other", "score": round(r.composite_score, 2), }) avg_sim = round(sum(e["similarity"] for e in edges) / max(len(edges), 1), 4) return { "nodes": nodes, "edges": edges, "stats": {"node_count": len(nodes), "edge_count": len(edges), "avg_similarity": avg_sim}, } def get_idea_clusters(db: Database) -> dict: """Cluster ideas (cached for 5 min).""" return _cached("idea_clusters", lambda: _compute_idea_clusters(db)) def _compute_idea_clusters(db: Database) -> dict: """Cluster ideas by embedding similarity, return clusters + t-SNE scatter. Uses Ward linkage on L2-normalized embeddings (approximates cosine) with a target of ~30 clusters for readable groupings. Enriches each cluster with WG info and category breakdown. """ embeddings = db.all_idea_embeddings() if not embeddings: return {"clusters": [], "scatter": [], "stats": {"total": 0, "clustered": 0, "num_clusters": 0}, "empty": True} # Exclude ideas from false-positive drafts fp_names = db.false_positive_names() # Fetch ideas with IDs for metadata lookup rows = db.conn.execute("SELECT id, title, description, idea_type, draft_name FROM ideas").fetchall() idea_map = {r["id"]: {"title": r["title"], "description": r["description"], "type": r["idea_type"], "draft_name": r["draft_name"]} for r in rows if r["draft_name"] not in fp_names} # Remove FP ideas from embeddings too embeddings = {k: v for k, v in embeddings.items() if k in idea_map} # Draft -> WG and category lookup draft_rows = db.conn.execute('SELECT name, "group", title FROM drafts').fetchall() draft_wg = {r["name"]: r["group"] or "none" for r in draft_rows} draft_title_map = {r["name"]: r["title"] for r in draft_rows} rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings WHERE COALESCE(false_positive, 0) = 0").fetchall() draft_cats: dict[str, list[str]] = {} for r in rating_rows: try: draft_cats[r["draft_name"]] = json.loads(r["categories"]) if r["categories"] else [] except (json.JSONDecodeError, TypeError): draft_cats[r["draft_name"]] = [] # Build matrix from embeddings that have matching ideas idea_ids = [iid for iid in embeddings if iid in idea_map] if len(idea_ids) < 5: return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} matrix = np.array([embeddings[iid] for iid in idea_ids]) matrix_norm = sk_normalize(matrix) # Ward clustering on normalized vectors — target ~30 clusters scaled by dataset size n_target = max(10, min(40, len(idea_ids) // 12)) try: clustering = AgglomerativeClustering(n_clusters=n_target, linkage='ward') labels = clustering.fit_predict(matrix_norm) except Exception: return {"clusters": [], "scatter": [], "stats": {"total": len(idea_ids), "clustered": 0, "num_clusters": 0}, "empty": True} # Build cluster data cluster_ideas_map: dict[int, list] = defaultdict(list) for idx, iid in enumerate(idea_ids): cluster_ideas_map[labels[idx]].append(iid) stop = {"a", "an", "the", "of", "for", "in", "to", "and", "or", "with", "on", "by", "is", "as", "at", "from", "that", "this", "it", "based", "using", "protocol", "mechanism", "framework", "system", "network", "agent", "agents"} clusters = [] for cid in sorted(cluster_ideas_map.keys()): members = cluster_ideas_map[cid] ideas_in_cluster = [idea_map[iid] for iid in members if iid in idea_map] if len(ideas_in_cluster) < 2: continue # Theme: most common significant words in titles words = Counter() for idea in ideas_in_cluster: for w in idea["title"].lower().split(): w_clean = w.strip("()[].,;:-\"'") if len(w_clean) > 2 and w_clean not in stop: words[w_clean] += 1 top_words = [w for w, _ in words.most_common(4)] theme = " ".join(top_words).title() if top_words else f"Cluster {cid}" drafts = list({idea["draft_name"] for idea in ideas_in_cluster}) # Enrich: WG breakdown wg_counts: dict[str, int] = Counter() cat_counts: dict[str, int] = Counter() for dname in drafts: wg = draft_wg.get(dname, "none") wg_counts[wg] += 1 for cat in draft_cats.get(dname, []): cat_counts[cat] += 1 wg_list = [{"wg": wg, "count": cnt} for wg, cnt in wg_counts.most_common(5)] cat_list = [{"cat": cat, "count": cnt} for cat, cnt in cat_counts.most_common(3)] cross_wg = len([w for w in wg_counts if w != "none"]) >= 2 clusters.append({ "id": len(clusters), "theme": theme, "size": len(ideas_in_cluster), "ideas": ideas_in_cluster[:20], "drafts": drafts, "wgs": wg_list, "categories": cat_list, "cross_wg": cross_wg, "wg_count": len(wg_counts), }) clusters.sort(key=lambda c: c["size"], reverse=True) # Build mapping: original cluster label -> sorted index # Each cluster remembers which original label it came from via its member ids old_label_to_new: dict[int, int] = {} for new_idx, c in enumerate(clusters): c["id"] = new_idx # Find original label for any member of this cluster for old_cid, members in cluster_ideas_map.items(): if members and members[0] in [iid for iid in members if iid in idea_map]: member_titles = {idea_map[m]["title"] for m in members if m in idea_map} c_titles = {idea["title"] for idea in c["ideas"]} if member_titles == c_titles or (member_titles & c_titles and len(members) == c["size"]): old_label_to_new[old_cid] = new_idx break # Fallback: build from idea_id -> label mapping iid_to_new: dict[int, int] = {} for old_cid, members in cluster_ideas_map.items(): new_idx = old_label_to_new.get(old_cid, old_cid) for iid in members: iid_to_new[iid] = new_idx # t-SNE for scatter scatter = [] try: perp = min(30, len(idea_ids) - 1) tsne = TSNE(n_components=2, perplexity=perp, random_state=42, max_iter=500) coords = tsne.fit_transform(matrix_norm) for idx, iid in enumerate(idea_ids): info = idea_map.get(iid, {}) scatter.append({ "x": round(float(coords[idx, 0]), 3), "y": round(float(coords[idx, 1]), 3), "cluster_id": iid_to_new.get(iid, int(labels[idx])), "title": info.get("title", ""), "draft_name": info.get("draft_name", ""), "wg": draft_wg.get(info.get("draft_name", ""), ""), }) except Exception: pass # --- Cross-cluster links --- # Find pairs of clusters whose ideas are semantically related # Use centroid similarity + best idea-pair links links = [] if len(clusters) >= 2: # Build cluster centroids from normalized embeddings cluster_centroids = {} cluster_member_indices: dict[int, list[int]] = defaultdict(list) for idx, iid in enumerate(idea_ids): cid = iid_to_new.get(iid, int(labels[idx])) cluster_member_indices[cid].append(idx) for cid, indices in cluster_member_indices.items(): if indices: centroid = matrix_norm[indices].mean(axis=0) norm = np.linalg.norm(centroid) if norm > 0: cluster_centroids[cid] = centroid / norm # Compute pairwise centroid similarity for all cluster pairs cids_sorted = sorted(cluster_centroids.keys()) for ci_idx, ci in enumerate(cids_sorted): for cj in cids_sorted[ci_idx + 1:]: sim = float(np.dot(cluster_centroids[ci], cluster_centroids[cj])) if sim < 0.45: continue # Find the best idea pair across these two clusters best_sim = 0.0 best_pair = (None, None) # Sample up to 20 ideas per cluster to keep it fast ci_members = cluster_member_indices[ci][:20] cj_members = cluster_member_indices[cj][:20] for mi in ci_members: for mj in cj_members: pair_sim = float(np.dot(matrix_norm[mi], matrix_norm[mj])) if pair_sim > best_sim: best_sim = pair_sim best_pair = (idea_ids[mi], idea_ids[mj]) if best_sim < 0.5: continue # Get theme names ci_theme = next((c["theme"] for c in clusters if c["id"] == ci), f"Cluster {ci}") cj_theme = next((c["theme"] for c in clusters if c["id"] == cj), f"Cluster {cj}") idea_a = idea_map.get(best_pair[0], {}) idea_b = idea_map.get(best_pair[1], {}) links.append({ "source": ci, "target": cj, "source_theme": ci_theme, "target_theme": cj_theme, "similarity": round(sim, 3), "best_pair_sim": round(best_sim, 3), "idea_a": idea_a.get("title", ""), "idea_a_draft": idea_a.get("draft_name", ""), "idea_b": idea_b.get("title", ""), "idea_b_draft": idea_b.get("draft_name", ""), }) links.sort(key=lambda l: l["best_pair_sim"], reverse=True) links = links[:50] # cap at top 50 links total = len(idea_ids) clustered = sum(c["size"] for c in clusters) return { "clusters": clusters, "scatter": scatter, "links": links, "stats": {"total": total, "clustered": clustered, "num_clusters": len(clusters)}, "empty": False, } def get_timeline_animation_data(db: Database) -> dict: """Timeline animation (cached for 5 min).""" return _cached("timeline_animation", lambda: _compute_timeline_animation_data(db)) def _compute_timeline_animation_data(db: Database) -> dict: """Compute t-SNE on all drafts, return points with month info + category_monthly. t-SNE is computed once on ALL drafts so coordinates are stable across animation frames. Each point carries a ``month`` field (YYYY-MM) so the front-end can build cumulative animation frames. """ embeddings = db.all_embeddings() if len(embeddings) < 5: return {"points": [], "months": [], "category_monthly": {}} pairs = db.drafts_with_ratings(limit=1000) rating_map = {d.name: r for d, r in pairs} draft_map = {d.name: d for d, _ in pairs} # Filter to drafts that have both embeddings and ratings names = [n for n in embeddings if n in rating_map] if len(names) < 5: return {"points": [], "months": [], "category_monthly": {}} matrix = np.array([embeddings[n] for n in names]) try: tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), random_state=42, max_iter=500) coords = tsne.fit_transform(matrix) except Exception: return {"points": [], "months": [], "category_monthly": {}} # Build points with month points = [] month_set: set[str] = set() category_monthly: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) for i, name in enumerate(names): r = rating_map[name] d = draft_map.get(name) month = _extract_month(d.time if d else None) if month == "unknown": continue # Undated docs (e.g. ISO/ETSI) can't be placed on a temporal animation cat = r.categories[0] if r.categories else "Other" month_set.add(month) category_monthly[month][cat] += 1 points.append({ "name": name, "title": d.title if d else name, "x": round(float(coords[i, 0]), 3), "y": round(float(coords[i, 1]), 3), "category": cat, "score": round(r.composite_score, 2), "month": month, }) # Deliver points in chronological order so the front-end's cumulative # filter (p.month <= frame) is append-only. Otherwise new points get # inserted mid-array and Plotly's index-based frame transition animates # existing markers flying to other drafts' coordinates ("jumping points"). points.sort(key=lambda p: (p["month"], p["name"])) months = sorted(month_set) # Convert defaultdict to plain dict for JSON cat_monthly_plain = {m: dict(cats) for m, cats in category_monthly.items()} return { "points": points, "months": months, "category_monthly": cat_monthly_plain, } def get_monitor_status(db: Database) -> MonitorStatus: """Return monitoring status data for dashboard.""" runs = db.get_monitor_runs(limit=20) last = runs[0] if runs else None total_drafts = db.count_drafts() rated_count = len(db.drafts_with_ratings(limit=10000)) unrated = len(db.unrated_drafts(limit=9999)) unembedded = len(db.drafts_without_embeddings(limit=9999)) embedded_count = total_drafts - unembedded no_ideas = len(db.drafts_without_ideas(limit=9999)) ideas_count = total_drafts - no_ideas idea_total = db.idea_count() gap_count = len(db.all_gaps()) input_tok, output_tok = db.total_tokens_used() # Estimate cost (Sonnet pricing: $3/M input, $15/M output) est_cost = (input_tok * 3.0 / 1_000_000) + (output_tok * 15.0 / 1_000_000) return { "last_run": last, "runs": runs, "unprocessed": {"unrated": unrated, "unembedded": unembedded, "no_ideas": no_ideas}, "total_runs": len(runs), "pipeline": { "total_drafts": total_drafts, "rated": rated_count, "embedded": embedded_count, "with_ideas": ideas_count, "idea_total": idea_total, "gap_count": gap_count, }, "cost": { "input_tokens": input_tok, "output_tokens": output_tok, "estimated_usd": round(est_cost, 2), }, } def get_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: """Return citation graph (cached for 5 min).""" return _cached(f"citation_graph_{min_refs}", lambda: _compute_citation_graph(db, min_refs)) def _compute_citation_graph(db: Database, min_refs: int = 2) -> CitationGraph: """Return citation network data for force-directed graph. Returns {nodes: [{id, type, title, influence, ...}], edges: [{source, target}], stats: {node_count, edge_count, ...}} """ # Get all references rows = db.conn.execute( "SELECT draft_name, ref_type, ref_id FROM draft_refs" ).fetchall() # Count in-degree for each referenced item in_degree: dict[str, int] = Counter() edges_raw = [] for r in rows: ref_key = f"{r['ref_type']}:{r['ref_id']}" in_degree[ref_key] += 1 edges_raw.append((r["draft_name"], ref_key)) # Also count drafts as source nodes draft_out: dict[str, int] = Counter() for draft_name, _ in edges_raw: draft_out[draft_name] += 1 # Get draft titles for labeling draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() draft_titles = {r["name"]: r["title"] for r in draft_rows} # Get rating categories for draft coloring rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() draft_cats = {} for r in rating_rows: try: cats = json.loads(r["categories"]) if r["categories"] else [] draft_cats[r["draft_name"]] = cats[0] if cats else "Other" except Exception: draft_cats[r["draft_name"]] = "Other" # Filter: keep RFCs with min_refs+ references and all drafts that reference them top_refs = {k: v for k, v in in_degree.items() if v >= min_refs} # Build node set node_set = set() filtered_edges = [] for draft_name, ref_key in edges_raw: if ref_key in top_refs: node_set.add(draft_name) node_set.add(ref_key) filtered_edges.append({"source": draft_name, "target": ref_key}) # Limit to ~200 nodes max for readability if len(node_set) > 250: # Keep only refs with higher in-degree sorted_refs = sorted(top_refs.items(), key=lambda x: x[1], reverse=True) keep_refs = set(k for k, _ in sorted_refs[:80]) node_set = set() filtered_edges = [] for draft_name, ref_key in edges_raw: if ref_key in keep_refs: node_set.add(draft_name) node_set.add(ref_key) filtered_edges.append({"source": draft_name, "target": ref_key}) # Build nodes nodes = [] for nid in node_set: if ":" in nid and not nid.startswith("draft-"): # It's a reference node (rfc:1234, bcp:14, etc.) ref_type, ref_id = nid.split(":", 1) influence = in_degree.get(nid, 0) if ref_type == "rfc": try: title = f"RFC {int(ref_id)}" except ValueError: title = f"RFC {ref_id}" else: title = f"{ref_type.upper()} {ref_id}" nodes.append({ "id": nid, "type": ref_type, "title": title, "influence": influence, "ref_id": ref_id, }) else: # It's a draft node influence = in_degree.get(nid, 0) + draft_out.get(nid, 0) nodes.append({ "id": nid, "type": "draft", "title": draft_titles.get(nid, nid), "influence": draft_out.get(nid, 0), "category": draft_cats.get(nid, "Other"), }) # Stats rfc_count = sum(1 for n in nodes if n["type"] == "rfc") draft_count = sum(1 for n in nodes if n["type"] == "draft") return { "nodes": nodes, "edges": filtered_edges, "stats": { "node_count": len(nodes), "edge_count": len(filtered_edges), "rfc_count": rfc_count, "draft_count": draft_count, }, } def get_landscape_tsne(db: Database) -> list[dict]: """Compute t-SNE (cached for 5 min).""" return _cached("landscape_tsne", lambda: _compute_landscape_tsne(db)) def _compute_landscape_tsne(db: Database) -> list[dict]: """Compute t-SNE from embeddings, return [{name, title, x, y, category, score}].""" embeddings = db.all_embeddings() if len(embeddings) < 5: return [] pairs = db.drafts_with_ratings(limit=1000) rating_map = {d.name: r for d, r in pairs} draft_map = {d.name: d for d, _ in pairs} # Filter to drafts that have both embeddings and ratings names = [n for n in embeddings if n in rating_map] if len(names) < 5: return [] matrix = np.array([embeddings[n] for n in names]) try: tsne = TSNE(n_components=2, perplexity=min(30, len(names) - 1), random_state=42, max_iter=500) coords = tsne.fit_transform(matrix) except Exception: return [] result = [] for i, name in enumerate(names): r = rating_map[name] d = draft_map.get(name) result.append({ "name": name, "title": d.title if d else name, "x": round(float(coords[i, 0]), 3), "y": round(float(coords[i, 1]), 3), "category": r.categories[0] if r.categories else "Other", "score": round(r.composite_score, 2), }) return result def get_comparison_data(db: Database, names: list[str]) -> dict | None: """Get comparison data for a list of drafts. Returns { drafts: [{name, title, abstract, rating, ideas, refs, ...}], shared_ideas: [{title, drafts: [name,...]}], unique_ideas: {name: [{title, description}]}, shared_refs: [{type, id, drafts: [name,...]}], unique_refs: {name: [{type, id}]}, similarities: [{a, b, similarity}], comparison_text: str | None, } """ drafts_data = [] all_ideas: dict[str, list[dict]] = {} all_refs: dict[str, list[tuple[str, str]]] = {} for name in names: detail = get_draft_detail(db, name) if not detail: continue drafts_data.append(detail) all_ideas[name] = detail.get("ideas", []) all_refs[name] = [(r["type"], r["id"]) for r in detail.get("refs", [])] if len(drafts_data) < 2: return None # Find shared vs unique ideas (by title similarity) idea_title_drafts: dict[str, list[str]] = {} for name, ideas in all_ideas.items(): for idea in ideas: title_lower = idea["title"].lower().strip() if title_lower not in idea_title_drafts: idea_title_drafts[title_lower] = [] idea_title_drafts[title_lower].append(name) shared_ideas = [ {"title": title, "drafts": draft_list} for title, draft_list in idea_title_drafts.items() if len(set(draft_list)) > 1 ] unique_ideas: dict[str, list[dict]] = {} for name, ideas in all_ideas.items(): unique = [] for idea in ideas: title_lower = idea["title"].lower().strip() if len(set(idea_title_drafts.get(title_lower, []))) <= 1: unique.append({"title": idea["title"], "description": idea.get("description", "")}) unique_ideas[name] = unique # Find shared vs unique references ref_drafts: dict[tuple[str, str], list[str]] = {} for name, refs in all_refs.items(): for ref in refs: if ref not in ref_drafts: ref_drafts[ref] = [] ref_drafts[ref].append(name) shared_refs = [ {"type": ref[0], "id": ref[1], "drafts": draft_list} for ref, draft_list in ref_drafts.items() if len(set(draft_list)) > 1 ] unique_refs: dict[str, list[dict]] = {} for name, refs in all_refs.items(): unique = [] for ref in refs: if len(set(ref_drafts.get(ref, []))) <= 1: unique.append({"type": ref[0], "id": ref[1]}) unique_refs[name] = unique # Pairwise embedding similarities embeddings = db.all_embeddings() similarities = [] valid_names = [d["name"] for d in drafts_data] for i in range(len(valid_names)): for j in range(i + 1, len(valid_names)): a, b = valid_names[i], valid_names[j] if a in embeddings and b in embeddings: vec_a = embeddings[a] vec_b = embeddings[b] dot = np.dot(vec_a, vec_b) norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) sim = float(dot / norm) if norm > 0 else 0.0 similarities.append({"a": a, "b": b, "similarity": round(sim, 4)}) return { "drafts": drafts_data, "shared_ideas": shared_ideas, "unique_ideas": unique_ideas, "shared_refs": shared_refs, "unique_refs": unique_refs, "similarities": similarities, "comparison_text": None, } def _classify_to_layer(text: str) -> str: """Classify a piece of text to the best-matching architectural layer.""" text_lower = text.lower() words = set(re.findall(r"[a-z][a-z0-9-]+", text_lower)) scores: dict[str, int] = {} for layer_id, kws in _LAYER_KEYWORDS.items(): scores[layer_id] = len(words & kws) # Also check for multi-word keywords as substrings for kw in kws: if len(kw) > 4 and kw in text_lower: scores[layer_id] += 1 best = max(scores, key=lambda k: scores[k]) return best if scores[best] > 0 else "communication" # default def get_architecture(db: Database) -> dict: """Build system-of-systems architecture from idea clusters, gaps, and source coverage.""" return _cached("architecture", lambda: _compute_architecture(db), ttl=600) def _compute_architecture(db: Database) -> dict: """Compute the architecture view. Returns: { "components": [...], # architectural building blocks "dependencies": [...], # edges between components "gaps": [...], # gaps mapped to layers "layers": [...], # layer definitions "source_coverage": {...}, # per-layer source coverage "stats": {...} } """ # --- Gather raw data --- cluster_data = get_idea_clusters(db) clusters = cluster_data.get("clusters", []) links = cluster_data.get("links", []) all_gaps = db.all_gaps() # Source coverage: count drafts per source per layer draft_rows = db.conn.execute( "SELECT d.name, d.title, d.abstract, d.source, r.categories " "FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name " "WHERE COALESCE(r.false_positive, 0) = 0" ).fetchall() # Build components from idea clusters components = [] cluster_to_component: dict[int, int] = {} # cluster_id -> component index for cl in clusters: if cl["size"] < 3: continue # skip tiny clusters # Determine layer from cluster theme + idea titles text_blob = cl.get("theme", "") for idea in cl.get("ideas", [])[:10]: text_blob += " " + idea.get("title", "") + " " + idea.get("description", "") layer = _classify_to_layer(text_blob) # Source coverage for this component's drafts draft_names = set(cl.get("drafts", [])) sources: Counter = Counter() comp_drafts: list[dict] = [] for dr in draft_rows: if dr["name"] in draft_names: sources[dr["source"] or "ietf"] += 1 comp_drafts.append({"name": dr["name"], "title": (dr["title"] or dr["name"])[:80], "source": dr["source"] or "ietf"}) # Idea type breakdown type_counts: Counter = Counter() for idea in cl.get("ideas", []): t = idea.get("type", "") if t: type_counts[t] += 1 # Maturity: rough proxy from idea count and source diversity maturity = min(5, 1 + len(sources) + (1 if cl["size"] >= 10 else 0) + (1 if cl.get("cross_wg") else 0)) comp = { "id": len(components), "cluster_id": cl["id"], "name": cl.get("theme", f"Component {cl['id']}"), "layer": layer, "size": cl["size"], "draft_count": len(draft_names), "drafts": comp_drafts[:20], "sources": dict(sources.most_common()), "type_breakdown": dict(type_counts.most_common(5)), "maturity": maturity, "wgs": cl.get("wgs", [])[:3], "top_ideas": [{"title": i["title"], "type": i.get("type", ""), "draft_name": i.get("draft_name", "")} for i in cl.get("ideas", [])[:5]], "categories": cl.get("categories", []), } cluster_to_component[cl["id"]] = comp["id"] components.append(comp) # Build dependencies from cross-cluster links dependencies = [] for link in links: src_comp = cluster_to_component.get(link["source"]) tgt_comp = cluster_to_component.get(link["target"]) if src_comp is not None and tgt_comp is not None and src_comp != tgt_comp: dependencies.append({ "source": src_comp, "target": tgt_comp, "similarity": link.get("best_pair_sim", link.get("similarity", 0)), "idea_a": link.get("idea_a", ""), "idea_b": link.get("idea_b", ""), }) # Map gaps to layers gap_items = [] for gap in all_gaps: text = gap["topic"] + " " + gap.get("description", "") + " " + gap.get("category", "") layer = _classify_to_layer(text) gap_items.append({ "id": gap["id"], "topic": gap["topic"], "description": gap["description"], "evidence": gap.get("evidence", ""), "severity": gap.get("severity", "medium"), "category": gap.get("category", ""), "layer": layer, }) # Source coverage per layer source_coverage: dict[str, dict[str, int]] = {l["id"]: Counter() for l in _ARCH_LAYERS} for dr in draft_rows: text = (dr["title"] or "") + " " + (dr["abstract"] or "")[:200] layer = _classify_to_layer(text) source_coverage[layer][dr["source"] or "ietf"] += 1 # Convert Counters to dicts source_coverage = {k: dict(v) for k, v in source_coverage.items()} # Layer summary stats layer_info = [] for l in _ARCH_LAYERS: lid = l["id"] comp_count = sum(1 for c in components if c["layer"] == lid) idea_count = sum(c["size"] for c in components if c["layer"] == lid) gap_count = sum(1 for g in gap_items if g["layer"] == lid) layer_info.append({ "id": l["id"], "label": l["label"], "order": l["order"], "component_count": comp_count, "idea_count": idea_count, "gap_count": gap_count, "coverage": source_coverage.get(lid, {}), "total_drafts": sum(source_coverage.get(lid, {}).values()), }) return { "components": components, "dependencies": dependencies, "gaps": gap_items, "layers": layer_info, "stats": { "total_components": len(components), "total_dependencies": len(dependencies), "total_gaps": len(gap_items), "layers_with_gaps": len(set(g["layer"] for g in gap_items)), }, } def get_idea_analysis(db: Database) -> dict: """Return comprehensive idea analysis data for the idea-analysis page. Includes novelty distribution, type breakdown with avg novelty, top novel ideas, ideas-per-draft distribution, cross-tab of type x source, shared ideas across drafts, and idea novelty vs draft rating correlation. """ from collections import Counter, defaultdict from difflib import SequenceMatcher # Fetch raw data all_ideas = db.conn.execute( """SELECT i.id, i.draft_name, i.title, i.description, i.idea_type, i.novelty_score FROM ideas i ORDER BY i.novelty_score DESC NULLS LAST""" ).fetchall() all_ideas = [dict(r) for r in all_ideas] # Draft ratings lookup ratings_rows = db.conn.execute( """SELECT d.name, d.title as draft_title, d.source, r.novelty AS r_novelty, r.maturity, r.overlap, r.momentum, r.relevance FROM drafts d LEFT JOIN ratings r ON d.name = r.draft_name""" ).fetchall() draft_info = {} for r in ratings_rows: row = dict(r) # Compute composite score (average of 5 dimensions) dims = [row.get("r_novelty"), row.get("maturity"), row.get("overlap"), row.get("momentum"), row.get("relevance")] valid = [d for d in dims if d is not None] row["composite_score"] = sum(valid) / len(valid) if valid else None draft_info[row["name"]] = row total = len(all_ideas) scored = [i for i in all_ideas if i.get("novelty_score") is not None] unscored = total - len(scored) avg_novelty = sum(i["novelty_score"] for i in scored) / len(scored) if scored else 0 # Embedding coverage embed_count = db.conn.execute("SELECT COUNT(*) FROM idea_embeddings").fetchone()[0] # --- Novelty score distribution (histogram) --- novelty_dist = Counter(i["novelty_score"] for i in scored) novelty_histogram = { "labels": [1, 2, 3, 4, 5], "values": [novelty_dist.get(s, 0) for s in [1, 2, 3, 4, 5]], } # --- Ideas by type with counts and avg novelty --- type_data = defaultdict(lambda: {"count": 0, "novelty_sum": 0, "novelty_n": 0}) for idea in all_ideas: t = idea.get("idea_type") or "other" type_data[t]["count"] += 1 if idea.get("novelty_score") is not None: type_data[t]["novelty_sum"] += idea["novelty_score"] type_data[t]["novelty_n"] += 1 by_type = [] for t, d in sorted(type_data.items(), key=lambda x: x[1]["count"], reverse=True): avg = d["novelty_sum"] / d["novelty_n"] if d["novelty_n"] > 0 else 0 by_type.append({"type": t, "count": d["count"], "avg_novelty": round(avg, 2)}) type_names = [t["type"] for t in by_type] # --- Top 20 most novel ideas (score 4-5) --- top_novel = [] for idea in all_ideas: if idea.get("novelty_score") and idea["novelty_score"] >= 4: di = draft_info.get(idea["draft_name"], {}) top_novel.append({ "title": idea["title"], "description": idea["description"], "type": idea.get("idea_type", "other"), "novelty_score": idea["novelty_score"], "draft_name": idea["draft_name"], "draft_title": di.get("draft_title", ""), "draft_score": di.get("composite_score"), }) top_novel.sort(key=lambda x: (x["novelty_score"], x.get("draft_score") or 0), reverse=True) top_novel = top_novel[:20] # --- Ideas per draft distribution --- ideas_per_draft = Counter(i["draft_name"] for i in all_ideas) ipd_dist = Counter(ideas_per_draft.values()) ideas_per_draft_hist = { "labels": sorted(ipd_dist.keys()), "values": [ipd_dist[k] for k in sorted(ipd_dist.keys())], } # Also top drafts by idea count top_idea_drafts = [] for name, count in ideas_per_draft.most_common(10): di = draft_info.get(name, {}) top_idea_drafts.append({ "name": name, "draft_title": di.get("draft_title", ""), "idea_count": count, "score": di.get("composite_score"), }) # --- Cross-tabulation: idea_type x source --- type_source = defaultdict(lambda: defaultdict(int)) for idea in all_ideas: t = idea.get("idea_type") or "other" di = draft_info.get(idea["draft_name"], {}) source = di.get("source", "ietf") or "ietf" type_source[t][source] += 1 sources = sorted(set( di.get("source", "ietf") or "ietf" for di in draft_info.values() )) cross_tab = [] for t in type_names: row = {"type": t} for s in sources: row[s] = type_source[t].get(s, 0) cross_tab.append(row) # --- Shared ideas across drafts --- idea_groups: list[dict] = [] for idea in all_ideas: title_lower = idea["title"].lower().strip() matched = False for group in idea_groups: ratio = SequenceMatcher(None, title_lower, group["canonical"]).ratio() if ratio >= 0.75: group["ideas"].append(idea) group["drafts"].add(idea["draft_name"]) matched = True break if not matched: idea_groups.append({ "canonical": title_lower, "title": idea["title"], "ideas": [idea], "drafts": {idea["draft_name"]}, }) shared_ideas = [] for g in sorted(idea_groups, key=lambda x: len(x["drafts"]), reverse=True): if len(g["drafts"]) < 2: break shared_ideas.append({ "title": g["title"], "appearances": len(g["drafts"]), "drafts": sorted(g["drafts"])[:8], "types": list(set(i.get("idea_type", "other") for i in g["ideas"])), }) # --- Scatter: draft avg idea novelty vs draft relevance --- draft_idea_novelty = defaultdict(list) for idea in scored: draft_idea_novelty[idea["draft_name"]].append(idea["novelty_score"]) scatter_data = [] for name, scores in draft_idea_novelty.items(): di = draft_info.get(name, {}) if di.get("relevance") is not None and di.get("composite_score") is not None: scatter_data.append({ "name": name, "avg_idea_novelty": round(sum(scores) / len(scores), 2), "relevance": di["relevance"], "score": di["composite_score"], "idea_count": len(scores), "source": di.get("source", "ietf") or "ietf", }) # --- Sunburst data: type -> novelty band --- sunburst_labels = [] sunburst_parents = [] sunburst_values = [] # Root sunburst_labels.append("All Ideas") sunburst_parents.append("") sunburst_values.append(total) novelty_bands = {"High (4-5)": lambda s: s is not None and s >= 4, "Medium (3)": lambda s: s is not None and s == 3, "Low (1-2)": lambda s: s is not None and s <= 2, "Unscored": lambda s: s is None} for t_info in by_type: t = t_info["type"] sunburst_labels.append(t) sunburst_parents.append("All Ideas") sunburst_values.append(t_info["count"]) # Sub-bands type_ideas = [i for i in all_ideas if (i.get("idea_type") or "other") == t] for band, fn in novelty_bands.items(): cnt = sum(1 for i in type_ideas if fn(i.get("novelty_score"))) if cnt > 0: sunburst_labels.append(f"{t} - {band}") sunburst_parents.append(t) sunburst_values.append(cnt) return { "total": total, "scored": len(scored), "unscored": unscored, "avg_novelty": round(avg_novelty, 2), "embed_count": embed_count, "embed_pct": round(embed_count / total * 100, 1) if total > 0 else 0, "type_count": len(by_type), "novelty_histogram": novelty_histogram, "by_type": by_type, "top_novel": top_novel, "ideas_per_draft_hist": ideas_per_draft_hist, "top_idea_drafts": top_idea_drafts, "cross_tab": cross_tab, "sources": sources, "shared_ideas": shared_ideas, "scatter_data": scatter_data, "sunburst": { "labels": sunburst_labels, "parents": sunburst_parents, "values": sunburst_values, }, } def get_trends_data(db: Database) -> dict: """Return temporal evolution data for the /trends page. Returns dict with: - monthly_submissions: [{month, source, count}, ...] - monthly_ratings: [{month, novelty, maturity, overlap, momentum, relevance}, ...] - monthly_categories: [{month, category, count}, ...] - safety_ratio: [{month, safety, capability, ratio}, ...] - cumulative_ideas: [{month, total}, ...] - monthly_new_authors: [{month, count}, ...] - stats: {fastest_growing, newest_active} - monthly_table: [{month, total, sources: {}, avg_score}, ...] """ conn = db.conn # 1. Monthly submissions by source rows = conn.execute(""" SELECT substr(time, 1, 7) AS month, source, COUNT(*) AS cnt FROM drafts WHERE time IS NOT NULL AND time != '' GROUP BY month, source ORDER BY month """).fetchall() monthly_submissions = [{"month": r["month"], "source": r["source"], "count": r["cnt"]} for r in rows] # 2. Monthly average ratings (all 5 dimensions) rows = conn.execute(""" SELECT substr(d.time, 1, 7) AS month, AVG(r.novelty) AS novelty, AVG(r.maturity) AS maturity, AVG(r.overlap) AS overlap, AVG(r.momentum) AS momentum, AVG(r.relevance) AS relevance, COUNT(*) AS cnt FROM drafts d JOIN ratings r ON d.name = r.draft_name WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 GROUP BY month ORDER BY month """).fetchall() monthly_ratings = [{ "month": r["month"], "novelty": round(r["novelty"], 2), "maturity": round(r["maturity"], 2), "overlap": round(r["overlap"], 2), "momentum": round(r["momentum"], 2), "relevance": round(r["relevance"], 2), "count": r["cnt"], } for r in rows] # 3. Monthly category distribution rows = conn.execute(""" SELECT substr(d.time, 1, 7) AS month, r.categories FROM drafts d JOIN ratings r ON d.name = r.draft_name WHERE d.time IS NOT NULL AND d.time != '' AND r.false_positive = 0 """).fetchall() cat_monthly: dict[str, Counter] = defaultdict(Counter) all_cats: Counter = Counter() for r in rows: month = r["month"] try: cats = json.loads(r["categories"]) if r["categories"] else [] except (json.JSONDecodeError, TypeError): cats = [] for c in cats: cat_monthly[month][c] += 1 all_cats[c] += 1 # Top 8 categories top_cats = [c for c, _ in all_cats.most_common(8)] months_sorted = sorted(cat_monthly.keys()) monthly_categories = [] for month in months_sorted: for cat in top_cats: monthly_categories.append({ "month": month, "category": cat, "count": cat_monthly[month].get(cat, 0), }) # 4. Safety ratio over time safety_ratio = [] for month in months_sorted: safety = sum(cat_monthly[month].get(c, 0) for c in SAFETY_CATEGORIES) capability = sum(cat_monthly[month].get(c, 0) for c in CAPABILITY_CATEGORIES) ratio = round(safety / capability, 2) if capability > 0 else 0 safety_ratio.append({ "month": month, "safety": safety, "capability": capability, "ratio": ratio, }) # 5. Cumulative idea count over time rows = conn.execute(""" SELECT substr(d.time, 1, 7) AS month, COUNT(i.id) AS cnt FROM ideas i JOIN drafts d ON i.draft_name = d.name WHERE d.time IS NOT NULL AND d.time != '' GROUP BY month ORDER BY month """).fetchall() cumulative = 0 cumulative_ideas = [] for r in rows: cumulative += r["cnt"] cumulative_ideas.append({"month": r["month"], "total": cumulative}) # 6. Monthly new author count (first-time contributors) rows = conn.execute(""" SELECT da.person_id, MIN(substr(d.time, 1, 7)) AS first_month FROM draft_authors da JOIN drafts d ON da.draft_name = d.name WHERE d.time IS NOT NULL AND d.time != '' GROUP BY da.person_id """).fetchall() new_author_monthly: Counter = Counter() for r in rows: if r["first_month"]: new_author_monthly[r["first_month"]] += 1 monthly_new_authors = [ {"month": m, "count": new_author_monthly.get(m, 0)} for m in months_sorted ] # 7. Stats: fastest growing category, newest active category fastest_growing = "" newest_active = "" if len(months_sorted) >= 4: mid = len(months_sorted) // 2 early_months = months_sorted[:mid] late_months = months_sorted[mid:] best_growth = -999 for cat in top_cats: early = sum(cat_monthly[m].get(cat, 0) for m in early_months) late = sum(cat_monthly[m].get(cat, 0) for m in late_months) if early > 0: growth = (late - early) / early elif late > 0: growth = float("inf") else: growth = 0 if growth > best_growth: best_growth = growth fastest_growing = cat # Newest active: category with latest first appearance cat_first_month: dict[str, str] = {} for month in months_sorted: for cat in all_cats: if cat not in cat_first_month and cat_monthly[month].get(cat, 0) > 0: cat_first_month[cat] = month if cat_first_month: newest_active = max(cat_first_month, key=lambda c: cat_first_month[c]) # 8. Monthly breakdown table monthly_table = [] for month in months_sorted: # Get per-source counts sources: dict[str, int] = {} total = 0 for s in monthly_submissions: if s["month"] == month: sources[s["source"]] = s["count"] total += s["count"] # Get avg score avg_row = conn.execute(""" SELECT AVG((r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0) AS avg_score FROM drafts d JOIN ratings r ON d.name = r.draft_name WHERE substr(d.time, 1, 7) = ? AND r.false_positive = 0 """, (month,)).fetchone() avg_score = round(avg_row["avg_score"], 2) if avg_row and avg_row["avg_score"] else 0 monthly_table.append({ "month": month, "total": total, "sources": sources, "avg_score": avg_score, }) return { "monthly_submissions": monthly_submissions, "monthly_ratings": monthly_ratings, "monthly_categories": monthly_categories, "safety_ratio": safety_ratio, "cumulative_ideas": cumulative_ideas, "monthly_new_authors": monthly_new_authors, "top_categories": top_cats, "months": months_sorted, "stats": { "fastest_growing": fastest_growing, "newest_active": newest_active, }, "monthly_table": monthly_table, } def get_complexity_data(db: Database) -> dict: """Return draft complexity analysis data for the /complexity page. For each rated draft, compute structural complexity metrics and correlate with rating dimensions. Returns dict with: - drafts: [{name, title, pages, author_count, citation_count, idea_count, category_count, novelty, maturity, overlap, momentum, relevance, score, composite_complexity}, ...] - correlations: {metric: {dimension: r_value}} - top_complex: top 10 most complex drafts - top_efficient: top 10 high-rating low-complexity drafts - stats: {avg_pages, avg_authors, avg_citations, pages_coverage_pct} - category_complexity: [{category, avg_pages, avg_authors, avg_citations, count}, ...] - source_complexity: [{source, avg_pages, avg_authors, avg_citations, count}, ...] """ conn = db.conn # Build per-draft complexity data rows = conn.execute(""" SELECT d.name, d.title, d.pages, d.source, r.novelty, r.maturity, r.overlap, r.momentum, r.relevance, r.categories, (r.novelty + r.maturity + r.overlap + r.momentum + r.relevance) / 5.0 AS score FROM drafts d JOIN ratings r ON d.name = r.draft_name WHERE r.false_positive = 0 """).fetchall() # Author counts author_counts = db.draft_author_count_map() # Citation counts (outgoing refs) citation_counts = {} for row in conn.execute(""" SELECT draft_name, COUNT(*) AS cnt FROM draft_refs GROUP BY draft_name """).fetchall(): citation_counts[row["draft_name"]] = row["cnt"] # Idea counts idea_counts = {} for row in conn.execute(""" SELECT draft_name, COUNT(*) AS cnt FROM ideas GROUP BY draft_name """).fetchall(): idea_counts[row["draft_name"]] = row["cnt"] drafts_data = [] total_with_pages = 0 total_drafts = 0 for r in rows: total_drafts += 1 pages = r["pages"] if pages is not None: total_with_pages += 1 try: cats = json.loads(r["categories"]) if r["categories"] else [] except (json.JSONDecodeError, TypeError): cats = [] ac = author_counts.get(r["name"], 0) cc = citation_counts.get(r["name"], 0) ic = idea_counts.get(r["name"], 0) cat_count = len(cats) # Composite complexity: normalize each metric to 0-1 scale and average # (raw values stored; composite calculated after we know max values) drafts_data.append({ "name": r["name"], "title": r["title"], "pages": pages, "source": r["source"] or "ietf", "author_count": ac, "citation_count": cc, "idea_count": ic, "category_count": cat_count, "categories": cats, "novelty": r["novelty"], "maturity": r["maturity"], "overlap": r["overlap"], "momentum": r["momentum"], "relevance": r["relevance"], "score": round(r["score"], 2), }) # Compute composite complexity score (normalized 0-1 each, then averaged) max_pages = max((d["pages"] for d in drafts_data if d["pages"] is not None), default=1) or 1 max_authors = max((d["author_count"] for d in drafts_data), default=1) or 1 max_citations = max((d["citation_count"] for d in drafts_data), default=1) or 1 max_ideas = max((d["idea_count"] for d in drafts_data), default=1) or 1 for d in drafts_data: p = (d["pages"] / max_pages) if d["pages"] is not None else 0.3 # default to median-ish a = d["author_count"] / max_authors c = d["citation_count"] / max_citations i = d["idea_count"] / max_ideas d["composite_complexity"] = round((p + a + c + i) / 4, 3) # Correlation matrix: complexity metrics vs rating dimensions metrics = ["pages", "author_count", "citation_count", "idea_count", "category_count"] dimensions = ["novelty", "maturity", "overlap", "momentum", "relevance"] def _pearson(xs: list[float], ys: list[float]) -> float: """Compute Pearson correlation coefficient.""" n = len(xs) if n < 3: return 0.0 mean_x = sum(xs) / n mean_y = sum(ys) / n cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) std_x = (sum((x - mean_x) ** 2 for x in xs)) ** 0.5 std_y = (sum((y - mean_y) ** 2 for y in ys)) ** 0.5 if std_x == 0 or std_y == 0: return 0.0 return round(cov / (std_x * std_y), 3) correlations: dict[str, dict[str, float]] = {} for metric in metrics: correlations[metric] = {} for dim in dimensions: if metric == "pages": # Filter to drafts with pages data pairs = [(d[metric], d[dim]) for d in drafts_data if d[metric] is not None] else: pairs = [(d[metric], d[dim]) for d in drafts_data] if len(pairs) >= 3: xs, ys = zip(*pairs) correlations[metric][dim] = _pearson(list(xs), list(ys)) else: correlations[metric][dim] = 0.0 # Top 10 most complex sorted_by_complexity = sorted(drafts_data, key=lambda d: d["composite_complexity"], reverse=True) top_complex = sorted_by_complexity[:10] # Top 10 efficient: high score but low complexity # Efficiency = score / (composite_complexity + 0.1) (avoid div by zero) for d in drafts_data: d["efficiency"] = round(d["score"] / (d["composite_complexity"] + 0.1), 2) sorted_by_efficiency = sorted(drafts_data, key=lambda d: d["efficiency"], reverse=True) top_efficient = sorted_by_efficiency[:10] # Stats pages_vals = [d["pages"] for d in drafts_data if d["pages"] is not None] avg_pages = round(sum(pages_vals) / len(pages_vals), 1) if pages_vals else 0 avg_authors = round(sum(d["author_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 avg_citations = round(sum(d["citation_count"] for d in drafts_data) / len(drafts_data), 1) if drafts_data else 0 pages_coverage = round(total_with_pages / total_drafts * 100, 1) if total_drafts else 0 # Category complexity averages cat_data: dict[str, list[dict]] = defaultdict(list) for d in drafts_data: for cat in d.get("categories", []): cat_data[cat].append(d) category_complexity = [] for cat, ds in sorted(cat_data.items(), key=lambda x: -len(x[1])): p_vals = [d["pages"] for d in ds if d["pages"] is not None] category_complexity.append({ "category": cat, "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), "avg_citations": round(sum(d["citation_count"] for d in ds) / len(ds), 1), "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), "count": len(ds), }) # Source complexity source_data: dict[str, list[dict]] = defaultdict(list) for d in drafts_data: source_data[d["source"]].append(d) source_complexity = [] for src, ds in sorted(source_data.items(), key=lambda x: -len(x[1])): p_vals = [d["pages"] for d in ds if d["pages"] is not None] source_complexity.append({ "source": src, "avg_pages": round(sum(p_vals) / len(p_vals), 1) if p_vals else 0, "avg_authors": round(sum(d["author_count"] for d in ds) / len(ds), 1), "avg_citations": round(sum(d["citation_count"] for d in ds) / len(ds), 1), "avg_score": round(sum(d["score"] for d in ds) / len(ds), 2), "count": len(ds), }) return { "drafts": drafts_data, "correlations": correlations, "metrics": metrics, "dimensions": dimensions, "top_complex": top_complex, "top_efficient": top_efficient, "stats": { "avg_pages": avg_pages, "avg_authors": avg_authors, "avg_citations": avg_citations, "pages_coverage_pct": pages_coverage, "total_drafts": total_drafts, }, "category_complexity": category_complexity, "source_complexity": source_complexity, } def get_source_comparison(db: Database) -> dict: """Cross-source comparison: ratings, categories, counts by standards body.""" pairs_all = db.drafts_with_ratings(limit=2000) # Also include false positives for completeness of source counts pairs_fp = db.drafts_with_ratings(limit=2000, include_false_positives=True) # Build per-source data source_stats: dict[str, dict] = {} source_categories: dict[str, Counter] = defaultdict(Counter) source_ratings: dict[str, dict[str, list]] = defaultdict(lambda: { "novelty": [], "maturity": [], "overlap": [], "momentum": [], "relevance": [], "scores": [], }) # Collect author counts per source all_authors_by_source: dict[str, set] = defaultdict(set) for draft, rating in pairs_all: src = getattr(draft, "source", "ietf") or "ietf" source_ratings[src]["novelty"].append(rating.novelty) source_ratings[src]["maturity"].append(rating.maturity) source_ratings[src]["overlap"].append(rating.overlap) source_ratings[src]["momentum"].append(rating.momentum) source_ratings[src]["relevance"].append(rating.relevance) source_ratings[src]["scores"].append(round(rating.composite_score, 2)) for cat in rating.categories: source_categories[src][cat] += 1 # Get all drafts (including unrated) for draft counts all_drafts = db.list_drafts(limit=5000) source_draft_counts: Counter = Counter() for d in all_drafts: src = getattr(d, "source", "ietf") or "ietf" source_draft_counts[src] += 1 # Author counts by source try: rows = db.conn.execute( """SELECT d.source, COUNT(DISTINCT da.person_id) as author_count FROM drafts d JOIN draft_authors da ON d.name = da.draft_name GROUP BY d.source""" ).fetchall() for r in rows: src = r["source"] or "ietf" all_authors_by_source[src] = r["author_count"] except Exception: pass # Idea counts by source source_idea_counts: Counter = Counter() try: rows = db.conn.execute( """SELECT d.source, COUNT(*) as idea_count FROM ideas i JOIN drafts d ON i.draft_name = d.name GROUP BY d.source""" ).fetchall() for r in rows: src = r["source"] or "ietf" source_idea_counts[src] = r["idea_count"] except Exception: pass # Build summary table all_sources = sorted(set(source_draft_counts.keys()) | set(source_ratings.keys())) summary = [] for src in all_sources: rats = source_ratings.get(src, {"scores": []}) cats = source_categories.get(src, Counter()) top_cat = cats.most_common(1)[0][0] if cats else "N/A" avg_score = round(sum(rats["scores"]) / len(rats["scores"]), 2) if rats["scores"] else 0.0 summary.append({ "source": src, "drafts": source_draft_counts.get(src, 0), "rated": len(rats["scores"]), "authors": all_authors_by_source.get(src, 0), "ideas": source_idea_counts.get(src, 0), "avg_score": avg_score, "top_category": top_cat, }) # Radar data: average of each dimension per source radar = {} for src, rats in source_ratings.items(): if not rats["scores"]: continue n = len(rats["scores"]) radar[src] = { "novelty": round(sum(rats["novelty"]) / n, 2), "maturity": round(sum(rats["maturity"]) / n, 2), "overlap": round(sum(rats["overlap"]) / n, 2), "momentum": round(sum(rats["momentum"]) / n, 2), "relevance": round(sum(rats["relevance"]) / n, 2), "count": n, } # Category distribution by source (for stacked bar / heatmap) all_cats = sorted({cat for cats in source_categories.values() for cat in cats}) heatmap = { "sources": list(source_categories.keys()), "categories": all_cats, "values": [], } for src in heatmap["sources"]: row = [source_categories[src].get(cat, 0) for cat in all_cats] heatmap["values"].append(row) # Unique/shared categories analysis source_cat_sets = {src: set(cats.keys()) for src, cats in source_categories.items()} unique_cats = {} for src, cats in source_cat_sets.items(): others = set() for s2, c2 in source_cat_sets.items(): if s2 != src: others |= c2 unique_cats[src] = sorted(cats - others) shared_cats = set() for src, cats in source_cat_sets.items(): for s2, c2 in source_cat_sets.items(): if s2 != src: shared_cats |= (cats & c2) shared_cats = sorted(shared_cats) return { "summary": summary, "radar": radar, "heatmap": heatmap, "unique_categories": unique_cats, "shared_categories": shared_cats, } def get_citation_influence(db: Database) -> dict: """Return citation influence analysis data (cached for 5 min).""" return _cached("citation_influence", lambda: _compute_citation_influence(db)) def _compute_citation_influence(db: Database) -> dict: """Compute citation influence metrics from the draft_refs table. Returns dict with: - top_cited_rfcs: top 20 most-cited RFCs with citation counts and citing drafts - top_citing_drafts: top 20 drafts that cite the most references - citations_by_category: average citations per category - stats: total citations, unique RFCs, avg refs per draft - draft_network: draft-to-draft citation edges for visualization """ # Get all references rows = db.conn.execute( "SELECT draft_name, ref_type, ref_id FROM draft_refs" ).fetchall() # Get draft titles and categories draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() draft_titles = {r["name"]: r["title"] for r in draft_rows} rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() draft_cats: dict[str, str] = {} for r in rating_rows: try: cats = json.loads(r["categories"]) if r["categories"] else [] draft_cats[r["draft_name"]] = cats[0] if cats else "Other" except Exception: draft_cats[r["draft_name"]] = "Other" # Well-known RFC names rfc_names = { "2119": "Key words (MUST/SHALL/MAY)", "8174": "Key words update", "8259": "JSON", "7519": "JWT", "6749": "OAuth 2.0", "7540": "HTTP/2", "9110": "HTTP Semantics", "7525": "TLS Recommendations", "8446": "TLS 1.3", "3986": "URIs", "7230": "HTTP/1.1 Syntax", "7231": "HTTP/1.1 Semantics", "8288": "Web Linking", "6125": "TLS Server Identity", "7515": "JWS", "7516": "JWE", "7517": "JWK", "7518": "JWA", "9449": "DPoP", "6750": "OAuth Bearer", "8725": "JWT Best Practices", "9396": "Rich Authorization Requests", "9101": "JAR", "8414": "OAuth Server Metadata", "7591": "Dynamic Client Registration", "8705": "mTLS for OAuth", "9068": "JWT Access Tokens", "6819": "OAuth Threat Model", "9200": "ACE-OAuth", "9052": "COSE", "8392": "CWT", "7252": "CoAP", } # In-degree: how many times each RFC is cited rfc_citations: dict[str, list[str]] = defaultdict(list) draft_out_count: dict[str, int] = Counter() draft_to_draft_edges = [] total_citations = 0 for r in rows: draft_name = r["draft_name"] ref_type = r["ref_type"] ref_id = r["ref_id"] total_citations += 1 draft_out_count[draft_name] += 1 if ref_type == "rfc": rfc_citations[ref_id].append(draft_name) elif ref_type == "draft": draft_to_draft_edges.append({ "source": draft_name, "target": ref_id, "source_title": draft_titles.get(draft_name, draft_name), "target_title": draft_titles.get(ref_id, ref_id), }) # Top 20 most-cited RFCs rfc_sorted = sorted(rfc_citations.items(), key=lambda x: len(x[1]), reverse=True) top_cited_rfcs = [] for ref_id, citing_drafts in rfc_sorted[:20]: top_cited_rfcs.append({ "rfc_id": ref_id, "name": rfc_names.get(ref_id, ""), "count": len(citing_drafts), "drafts": citing_drafts[:10], # Limit to first 10 for display "total_drafts": len(citing_drafts), }) # Top 20 most-citing drafts (out-degree) draft_sorted = sorted(draft_out_count.items(), key=lambda x: x[1], reverse=True) top_citing_drafts = [] for draft_name, count in draft_sorted[:20]: top_citing_drafts.append({ "name": draft_name, "title": draft_titles.get(draft_name, draft_name), "count": count, "category": draft_cats.get(draft_name, "Other"), }) # Citation density by category cat_totals: dict[str, int] = Counter() cat_counts: dict[str, int] = Counter() for draft_name, count in draft_out_count.items(): cat = draft_cats.get(draft_name, "Other") cat_totals[cat] += count cat_counts[cat] += 1 citations_by_category = [] for cat in sorted(cat_totals.keys()): avg = cat_totals[cat] / cat_counts[cat] if cat_counts[cat] > 0 else 0 citations_by_category.append({ "category": cat, "total_citations": cat_totals[cat], "draft_count": cat_counts[cat], "avg_citations": round(avg, 1), }) citations_by_category.sort(key=lambda x: x["avg_citations"], reverse=True) # PageRank-style influence: drafts that cite highly-cited RFCs # Simple approximation: sum of (1 / citation_count) for each RFC cited rfc_influence = {rid: len(drafts) for rid, drafts in rfc_citations.items()} draft_pagerank: dict[str, float] = Counter() for r in rows: if r["ref_type"] == "rfc" and r["ref_id"] in rfc_influence: # Higher score for citing highly-cited RFCs draft_pagerank[r["draft_name"]] += rfc_influence[r["ref_id"]] pagerank_sorted = sorted(draft_pagerank.items(), key=lambda x: x[1], reverse=True) top_pagerank = [] for draft_name, score in pagerank_sorted[:20]: top_pagerank.append({ "name": draft_name, "title": draft_titles.get(draft_name, draft_name), "score": round(score, 1), "category": draft_cats.get(draft_name, "Other"), "out_degree": draft_out_count.get(draft_name, 0), }) # Stats unique_rfcs = len(rfc_citations) drafts_with_refs = len(draft_out_count) avg_refs = total_citations / drafts_with_refs if drafts_with_refs > 0 else 0 return { "top_cited_rfcs": top_cited_rfcs, "top_citing_drafts": top_citing_drafts, "top_pagerank": top_pagerank, "citations_by_category": citations_by_category, "draft_network": draft_to_draft_edges[:200], # Limit for perf "stats": { "total_citations": total_citations, "unique_rfcs": unique_rfcs, "drafts_with_refs": drafts_with_refs, "avg_refs_per_draft": round(avg_refs, 1), }, } def get_bcp_analysis(db: Database) -> dict: """Return BCP dependency analysis data (cached for 5 min).""" return _cached("bcp_analysis", lambda: _compute_bcp_analysis(db)) def _compute_bcp_analysis(db: Database) -> dict: """Compute BCP dependency analysis. Returns dict with: - bcps: all BCPs with citation counts and citing drafts - co_citation: which BCPs tend to be co-cited - by_category: BCP citation patterns by category - coverage: what % of drafts cite at least one BCP """ # Get all BCP references bcp_rows = db.conn.execute( "SELECT draft_name, ref_id FROM draft_refs WHERE ref_type = 'bcp'" ).fetchall() # Get draft titles and categories draft_rows = db.conn.execute("SELECT name, title FROM drafts").fetchall() draft_titles = {r["name"]: r["title"] for r in draft_rows} total_drafts = len(draft_titles) rating_rows = db.conn.execute("SELECT draft_name, categories FROM ratings").fetchall() draft_cats: dict[str, str] = {} for r in rating_rows: try: cats = json.loads(r["categories"]) if r["categories"] else [] draft_cats[r["draft_name"]] = cats[0] if cats else "Other" except Exception: draft_cats[r["draft_name"]] = "Other" # BCP citation counts bcp_citations: dict[str, list[str]] = defaultdict(list) draft_bcps: dict[str, list[str]] = defaultdict(list) for r in bcp_rows: bcp_citations[r["ref_id"]].append(r["draft_name"]) draft_bcps[r["draft_name"]].append(r["ref_id"]) # All BCPs with counts bcps = [] for bcp_id, citing_drafts in sorted(bcp_citations.items(), key=lambda x: len(x[1]), reverse=True): bcps.append({ "bcp_id": bcp_id, "count": len(citing_drafts), "drafts": citing_drafts[:10], "total_drafts": len(citing_drafts), }) # Co-citation matrix: which BCPs appear together in the same draft bcp_ids = sorted(bcp_citations.keys()) co_citation = [] for i, bcp_a in enumerate(bcp_ids): drafts_a = set(bcp_citations[bcp_a]) for j, bcp_b in enumerate(bcp_ids): if j <= i: continue drafts_b = set(bcp_citations[bcp_b]) shared = len(drafts_a & drafts_b) if shared > 0: co_citation.append({ "bcp_a": bcp_a, "bcp_b": bcp_b, "count": shared, }) # Heatmap data: full matrix for all BCPs (top 20 by citation count) top_bcp_ids = [b["bcp_id"] for b in bcps[:20]] heatmap_matrix = [] for bcp_a in top_bcp_ids: row = [] drafts_a = set(bcp_citations.get(bcp_a, [])) for bcp_b in top_bcp_ids: drafts_b = set(bcp_citations.get(bcp_b, [])) shared = len(drafts_a & drafts_b) row.append(shared) heatmap_matrix.append(row) # BCP citations by category cat_bcp_count: dict[str, Counter] = defaultdict(Counter) for draft_name, bcp_list in draft_bcps.items(): cat = draft_cats.get(draft_name, "Other") for bcp_id in bcp_list: cat_bcp_count[cat][bcp_id] += 1 by_category = [] for cat in sorted(cat_bcp_count.keys()): top_bcps = cat_bcp_count[cat].most_common(5) by_category.append({ "category": cat, "total_bcp_refs": sum(cat_bcp_count[cat].values()), "unique_bcps": len(cat_bcp_count[cat]), "top_bcps": [{"bcp_id": bid, "count": c} for bid, c in top_bcps], }) by_category.sort(key=lambda x: x["total_bcp_refs"], reverse=True) # Coverage drafts_with_bcp = len(draft_bcps) coverage_pct = (drafts_with_bcp / total_drafts * 100) if total_drafts > 0 else 0 return { "bcps": bcps, "co_citation": co_citation, "heatmap_labels": top_bcp_ids, "heatmap_matrix": heatmap_matrix, "by_category": by_category, "coverage": { "total_drafts": total_drafts, "drafts_with_bcp": drafts_with_bcp, "coverage_pct": round(coverage_pct, 1), "unique_bcps": len(bcp_citations), "total_bcp_refs": len(bcp_rows), }, }