|
| 1 | +import argparse |
| 2 | +import json |
| 3 | +import os |
| 4 | +from datetime import datetime |
| 5 | +from itertools import combinations |
| 6 | +from typing import Any, Dict, Iterable, List, Tuple |
| 7 | + |
| 8 | +import networkx as nx |
| 9 | +import requests |
| 10 | +from collections import Counter |
| 11 | + |
| 12 | + |
# Default dataset: SingularityNET Ambassador Program meeting summaries for
# 2025, fetched as a single JSON array from the archive repository.
DEFAULT_INPUT = (
    "https://raw.githubusercontent.com/SingularityNET-Archive/"
    "SingularityNET-Archive/refs/heads/main/Data/Snet-Ambassador-Program/"
    "Meeting-Summaries/2025/meeting-summaries-array.json"
)
| 18 | + |
| 19 | + |
def is_url(source: str) -> bool:
    """Return True when *source* looks like an HTTP(S) URL."""
    return source.startswith(("http://", "https://"))
| 22 | + |
| 23 | + |
def load_json(source: str, timeout: float = 30.0) -> Any:
    """Load JSON from a local file path or an HTTP(S) URL.

    Args:
        source: Filesystem path, or an ``http://``/``https://`` URL.
        timeout: Seconds to wait on the HTTP request (new, defaulted, so
            existing callers are unaffected). Without a timeout,
            ``requests.get`` can block indefinitely on a stalled server.

    Returns:
        The parsed JSON value (list, dict, scalar, ...).

    Raises:
        requests.HTTPError: On a non-2xx HTTP response.
        OSError / json.JSONDecodeError: For local-file problems.
    """
    if is_url(source):
        resp = requests.get(source, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
    with open(source, "r", encoding="utf-8") as f:
        return json.load(f)
| 31 | + |
| 32 | + |
| 33 | +# ---------------- Utility ---------------- |
| 34 | + |
| 35 | +def _truncate_label(text: str, max_len: int = 80) -> str: |
| 36 | + if text is None: |
| 37 | + return "" |
| 38 | + safe = str(text).replace("\n", " ").strip() |
| 39 | + return safe if len(safe) <= max_len else (safe[: max_len - 1] + "…") |
| 40 | + |
| 41 | + |
| 42 | +# ---------------- Degree (Co-attendance) ---------------- |
| 43 | + |
def extract_participants(record: Dict[str, Any]) -> List[str]:
    """Collect participant names from one meeting record.

    Sources, in order:
    - ``meetingInfo.peoplePresent``: a comma-separated string
    - ``meetingInfo.host`` and ``meetingInfo.documenter``: single names

    Duplicates are dropped while preserving first-seen order. Non-dict
    records yield an empty list.
    """
    info: Dict[str, Any] = {}
    if isinstance(record, dict):
        info = record.get("meetingInfo", {}) or {}

    names: List[str] = []
    present = info.get("peoplePresent", "")
    if isinstance(present, str):
        names.extend(name.strip() for name in present.split(",") if name.strip())
    for role in ("host", "documenter"):
        value = info.get(role)
        if isinstance(value, str) and value.strip():
            names.append(value.strip())

    # dict.fromkeys gives an order-preserving dedupe.
    return list(dict.fromkeys(names))
| 70 | + |
| 71 | + |
def build_coattendance_graph(records: Iterable[Any]) -> nx.Graph:
    """Build a weighted graph linking people who attended the same meeting.

    Edge weight counts how many meetings the pair shared; meetings with
    fewer than two participants contribute nothing.
    """
    graph = nx.Graph()
    for record in records:
        people = extract_participants(record)
        if len(people) < 2:
            continue  # nothing to pair up
        graph.add_nodes_from(people)
        for left, right in combinations(people, 2):
            if graph.has_edge(left, right):
                graph[left][right]["weight"] += 1
            else:
                graph.add_edge(left, right, weight=1)
    return graph
| 86 | + |
| 87 | + |
def degree_analysis(G: nx.Graph) -> Tuple[Dict[str, int], Counter]:
    """Return (node -> degree, degree value -> number of nodes with it)."""
    degrees = {node: deg for node, deg in G.degree()}
    histogram = Counter(degrees.values())
    return degrees, histogram
| 92 | + |
| 93 | + |
| 94 | +# ---------------- JSON Path Structure ---------------- |
| 95 | + |
def extract_json_paths(obj: Any, prefix: str = "") -> List[str]:
    """Enumerate every path in a nested JSON value, depth-first.

    Dict keys append ``.key`` (no leading dot at the root); list elements
    append ``[index]``. Scalars contribute no paths of their own.
    """
    collected: List[str] = []
    if isinstance(obj, dict):
        children = [(f"{prefix}.{key}" if prefix else key, value) for key, value in obj.items()]
    elif isinstance(obj, list):
        children = [(f"{prefix}[{index}]", value) for index, value in enumerate(obj)]
    else:
        return collected
    for child_path, child_value in children:
        collected.append(child_path)
        collected.extend(extract_json_paths(child_value, child_path))
    return collected
| 109 | + |
| 110 | + |
def path_metrics(paths: List[str]) -> Dict[str, Any]:
    """Summarize a list of JSON paths produced by ``extract_json_paths``.

    Returns a dict with:
    - total_paths: number of paths
    - max_depth / avg_depth: nesting depth, counting both "." and "[" hops
    - deepest_paths: every path at max_depth
    - parent_counts: Counter of each path's immediate parent

    Fix over the previous version: the parent is taken at the path's LAST
    separator, whether that is a dot or a list index. Previously
    ``"a.b[0]"`` was split at the last dot and credited to ``"a"`` instead
    of its real parent ``"a.b"``. Top-level paths still count as their own
    parent.
    """
    depths = [p.count(".") + p.count("[") for p in paths]
    max_depth = max(depths) if depths else 0
    avg_depth = (sum(depths) / len(depths)) if depths else 0.0
    deepest_paths = [p for p, d in zip(paths, depths) if d == max_depth]

    def parent_of(path: str) -> str:
        # Split at the LAST separator so "a.b[0]" maps to "a.b", not "a".
        cut = max(path.rfind("."), path.rfind("["))
        return path[:cut] if cut > 0 else path

    parent_counts = Counter(parent_of(p) for p in paths)
    return {
        "total_paths": len(paths),
        "max_depth": max_depth,
        "avg_depth": avg_depth,
        "deepest_paths": deepest_paths,
        "parent_counts": parent_counts,
    }
| 124 | + |
| 125 | + |
def build_path_graph(paths: List[str]) -> nx.DiGraph:
    """Build a directed parent -> child tree over JSON paths.

    Fixes over the previous version:
    - The parent is found at the path's LAST separator, so ``"a.b[0]"``
      hangs off ``"a.b"``. The old dot-first check split at the last dot
      and wrongly attached it to ``"a"``.
    - Paths with no real parent (top level, including root list indices
      like ``"[0]"``) become isolated root nodes instead of children of a
      spurious empty-string node.
    """
    G = nx.DiGraph()
    for path in paths:
        cut = max(path.rfind("."), path.rfind("["))
        if cut > 0:
            G.add_edge(path[:cut], path)
        else:
            G.add_node(path)
    return G
| 138 | + |
| 139 | + |
| 140 | +# ---------------- Field Co-occurrence (Centrality, Clustering, Components) ---------------- |
| 141 | + |
def find_field_combinations(obj: Any) -> List[set]:
    """Gather the key set of every multi-key dict in a nested JSON value.

    Traversal is depth-first, pre-order; dicts with zero or one key are
    skipped since they contribute no co-occurring field pairs.
    """
    found: List[set] = []

    def visit(node: Any) -> None:
        if isinstance(node, dict):
            if len(node) > 1:
                found.append(set(node))
            for child in node.values():
                visit(child)
        elif isinstance(node, list):
            for child in node:
                visit(child)

    visit(obj)
    return found
| 154 | + |
| 155 | + |
def build_field_graph(data: Any) -> nx.Graph:
    """Build a weighted co-occurrence graph over JSON field names.

    Two fields are linked when they appear as keys of the same dict; the
    edge weight counts how many dicts they share.
    """
    graph = nx.Graph()
    for key_set in find_field_combinations(data):
        graph.add_nodes_from(key_set)
        for left, right in combinations(key_set, 2):
            if graph.has_edge(left, right):
                graph[left][right]["weight"] += 1
            else:
                graph.add_edge(left, right, weight=1)
    return graph
| 168 | + |
| 169 | + |
def compute_centrality_measures(G: nx.Graph) -> Dict[str, Dict[str, float]]:
    """Compute degree, betweenness, closeness, and eigenvector centrality.

    Returns a mapping of metric name -> {node -> score}. An empty graph
    yields empty per-metric dicts; if eigenvector power iteration fails to
    converge, every node falls back to 0.0.
    """
    if G.number_of_nodes() == 0:
        return {"degree": {}, "betweenness": {}, "closeness": {}, "eigenvector": {}}
    try:
        eigenvector = nx.eigenvector_centrality(G, max_iter=1000)
    except nx.PowerIterationFailedConvergence:
        # Non-convergence: report zeros rather than crash the pipeline.
        eigenvector = dict.fromkeys(G.nodes(), 0.0)
    return {
        "degree": nx.degree_centrality(G),
        "betweenness": nx.betweenness_centrality(G),
        "closeness": nx.closeness_centrality(G),
        "eigenvector": eigenvector,
    }
| 184 | + |
| 185 | + |
def clustering_metrics(G: nx.Graph, top: int) -> Tuple[float, List[Tuple[str, float]]]:
    """Return the average clustering coefficient and the top-N nodes by it.

    An empty graph yields ``(0.0, [])``.
    """
    if not G.number_of_nodes():
        return 0.0, []
    coefficients = nx.clustering(G)
    ranked = sorted(coefficients.items(), key=lambda pair: pair[1], reverse=True)
    return nx.average_clustering(G), ranked[:top]
| 193 | + |
| 194 | + |
def connected_components_info(G: nx.Graph, top: int) -> Dict[str, Any]:
    """Summarize connected components: count, sizes (descending), and a
    sample of up to *top* nodes from the largest component."""
    if G.number_of_nodes() == 0:
        return {"component_count": 0, "component_sizes": [], "largest_component_sample": []}
    ordered = sorted(nx.connected_components(G), key=len, reverse=True)
    return {
        "component_count": len(ordered),
        "component_sizes": [len(component) for component in ordered],
        "largest_component_sample": list(ordered[0])[:top],
    }
| 203 | + |
| 204 | + |
| 205 | +# ---------------- Report Writer ---------------- |
| 206 | + |
def write_report(
    output_file: str,
    summary: Dict[str, Any],
    degree: Tuple[Dict[str, int], Counter],
    degree_top: List[Tuple[str, int]],
    degree_dist: List[Tuple[int, int]],
    path_info: Dict[str, Any],
    parent_top: List[Tuple[str, int]],
    centrality: Dict[str, Dict[str, float]],
    clustering: Tuple[float, List[Tuple[str, float]]],
    components: Dict[str, Any],
) -> None:
    """Render all analysis results as one markdown report at *output_file*.

    Parameters mirror the outputs of this module's analysis helpers:
    - summary: label -> value pairs for the overview section
    - degree: (node -> degree, degree -> node count); currently not
      referenced below, kept only for interface compatibility with callers
    - degree_top / degree_dist: pre-sorted rows for the degree tables
    - path_info: metrics dict from path_metrics()
    - parent_top: (parent path, count) rows
    - centrality: metric name -> {field -> score}
    - clustering: (average coefficient, top (node, coefficient) rows)
    - components: dict from connected_components_info()

    The file is overwritten; its parent directory is created on demand.
    """
    # Fix: os.makedirs("") raises FileNotFoundError, so only create the
    # parent directory when the output path actually contains one.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("# Unified Graph Analysis Report\n")
        f.write(f"**Generated on:** {timestamp}\n\n")

        # Summary
        f.write("## Summary\n")
        for k, v in summary.items():
            f.write(f"- {k}: {v}\n")
        f.write("\n")

        # Degree Analysis
        f.write("## Degree (Co-attendance) Analysis\n")
        f.write("### Top Nodes by Degree\n")
        f.write("| Rank | Node | Degree |\n|------|------|--------|\n")
        for i, (node, deg) in enumerate(degree_top, 1):
            # Truncate long participant names so the table stays readable.
            label = _truncate_label(node, 80)
            f.write(f"| {i} | {label} | {deg} |\n")
        f.write("\n")
        f.write("### Degree Distribution\n")
        f.write("| Degree | Count of Nodes |\n|--------|-----------------|\n")
        for d, c in degree_dist:
            f.write(f"| {d} | {c} |\n")
        f.write("\n")

        # Path Analysis
        f.write("## JSON Path Structure Analysis\n")
        f.write(f"- Total Unique Paths: {path_info['total_paths']}\n")
        f.write(f"- Maximum Depth: {path_info['max_depth']}\n")
        f.write(f"- Average Depth: {path_info['avg_depth']:.2f}\n\n")
        f.write("### Deepest JSON Paths (sample)\n")
        for p in path_info["deepest_paths"][:10]:
            f.write(f"- `{p}`\n")
        f.write("\n")
        f.write("### Most Common Parent Paths\n")
        f.write("| Rank | Parent Path | Count |\n|------|-------------|-------|\n")
        for i, (parent, cnt) in enumerate(parent_top, 1):
            f.write(f"| {i} | `{parent}` | {cnt} |\n")
        f.write("\n")

        # Centrality
        f.write("## Field Centrality (Co-occurrence)\n")
        metrics = centrality
        # Rank fields by degree centrality; other measures shown alongside.
        top_fields = sorted(metrics["degree"].keys(), key=lambda x: metrics["degree"][x], reverse=True)[:10]
        f.write("| Rank | Field | Degree | Betweenness | Closeness | Eigenvector |\n")
        f.write("|------|-------|--------|-------------|-----------|------------|\n")
        for i, node in enumerate(top_fields, 1):
            f.write(
                f"| {i} | {node} | "
                f"{metrics['degree'].get(node, 0):.3f} | "
                f"{metrics['betweenness'].get(node, 0):.3f} | "
                f"{metrics['closeness'].get(node, 0):.3f} | "
                f"{metrics['eigenvector'].get(node, 0):.3f} |\n"
            )
        f.write("\n")

        # Clustering
        avg_clust, top_clust_nodes = clustering
        f.write("## Clustering (Field Co-occurrence Graph)\n")
        f.write(f"- Average Clustering Coefficient: {avg_clust:.3f}\n\n")
        f.write("### Top Nodes by Clustering Coefficient\n")
        f.write("| Rank | Field | Clustering |\n|------|-------|------------|\n")
        for i, (node, val) in enumerate(top_clust_nodes, 1):
            f.write(f"| {i} | {node} | {val:.3f} |\n")
        f.write("\n")

        # Connected Components
        f.write("## Connected Components (Field Co-occurrence Graph)\n")
        f.write(f"- Number of Components: {components['component_count']}\n")
        f.write(f"- Component Sizes (top 10): {components['component_sizes'][:10]}\n")
        f.write("- Sample of Largest Component Nodes (top 10):\n")
        for n in components["largest_component_sample"][:10]:
            f.write(f"  - {n}\n")
        f.write("\n")
| 294 | + |
| 295 | + |
def ensure_iterable_records(data: Any) -> List[Any]:
    """Coerce a loaded JSON value into a list of records.

    Lists pass through unchanged, a lone dict is wrapped in a list, and
    anything else yields an empty list.
    """
    if isinstance(data, list):
        return data
    return [data] if isinstance(data, dict) else []
| 302 | + |
| 303 | + |
def main() -> None:
    """CLI entry point: load the dataset, run every analysis, write the report."""
    parser = argparse.ArgumentParser(description="Unified Graph Analysis")
    parser.add_argument("--input", default=DEFAULT_INPUT, help="Local JSON file path or HTTP(S) URL")
    parser.add_argument("--output", default="reports/unified_analysis_report.md", help="Markdown report output path")
    parser.add_argument("--limit-top", type=int, default=10, help="Top-N rows to include in tables")
    opts = parser.parse_args()

    raw = load_json(opts.input)
    records = ensure_iterable_records(raw)

    # Participant co-attendance graph and its degree statistics.
    attendance_graph = build_coattendance_graph(records)
    node_degrees, degree_histogram = degree_analysis(attendance_graph)
    top_by_degree = sorted(node_degrees.items(), key=lambda item: item[1], reverse=True)[: opts.limit_top]
    distribution = sorted(degree_histogram.items(), key=lambda item: item[0])

    # Structural view of the raw JSON document.
    all_paths = extract_json_paths(raw)
    structure = path_metrics(all_paths)
    frequent_parents = structure["parent_counts"].most_common(opts.limit_top)
    path_graph = build_path_graph(all_paths)

    # Field co-occurrence graph: centrality, clustering, and components.
    field_graph = build_field_graph(raw)
    centrality = compute_centrality_measures(field_graph)
    avg_coefficient, top_clustered = clustering_metrics(field_graph, opts.limit_top)
    component_info = connected_components_info(field_graph, opts.limit_top)

    summary = {
        "Co-attendance graph (nodes)": len(attendance_graph.nodes),
        "Co-attendance graph (edges)": len(attendance_graph.edges),
        "Path graph (nodes)": len(path_graph.nodes),
        "Path graph (edges)": len(path_graph.edges),
        "Field graph (nodes)": len(field_graph.nodes),
        "Field graph (edges)": len(field_graph.edges),
    }

    write_report(
        output_file=opts.output,
        summary=summary,
        degree=(node_degrees, degree_histogram),
        degree_top=top_by_degree,
        degree_dist=distribution,
        path_info=structure,
        parent_top=frequent_parents,
        centrality=centrality,
        clustering=(avg_coefficient, top_clustered),
        components=component_info,
    )
    print(f"✅ Unified report written to: {opts.output}")
| 369 | + |
| 370 | + |
| 371 | +if __name__ == "__main__": |
| 372 | + main() |
0 commit comments