Skip to content

Commit ac6e3b6

Browse files
committed
Create connected components script
1 parent 6d15399 commit ac6e3b6

File tree

2 files changed

+173
-0
lines changed

2 files changed

+173
-0
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import json
2+
import requests
3+
import networkx as nx
4+
from itertools import combinations
5+
from datetime import datetime
6+
import os
7+
8+
9+
def load_json_remote(url, timeout=30):
    """Load JSON data from a remote URL.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON payload (whatever ``response.json()`` produces).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: (or a subclass) if the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
14+
15+
16+
def find_field_combinations(obj):
    """Collect the key sets of every multi-key dict nested inside ``obj``.

    Dicts and lists are walked depth-first. Each dict with more than one
    key contributes its key set before the sets found in its values.
    Anything that is neither a dict nor a list yields nothing.

    Returns:
        A list of sets of field names, in traversal order.
    """
    if isinstance(obj, dict):
        # A single-key dict has no co-occurring fields, so it adds no set.
        found = [set(obj)] if len(obj) > 1 else []
        children = obj.values()
    elif isinstance(obj, list):
        found = []
        children = obj
    else:
        return []

    for child in children:
        found += find_field_combinations(child)
    return found
29+
30+
31+
def build_field_graph(data):
    """Build an undirected, weighted field co-occurrence graph.

    Every field name returned by ``find_field_combinations`` becomes a
    node. Each pair of fields sharing a set gets an edge whose ``weight``
    counts how many sets contained that pair.
    """
    graph = nx.Graph()
    for field_set in find_field_combinations(data):
        graph.add_nodes_from(field_set)
        for left, right in combinations(field_set, 2):
            if not graph.has_edge(left, right):
                graph.add_edge(left, right, weight=1)
                continue
            # Pair seen before: bump its co-occurrence count.
            graph[left][right]["weight"] += 1
    return graph
44+
45+
46+
def connected_components_analysis(G):
    """Summarise the connected components (clusters) of ``G``.

    Returns:
        A 4-tuple ``(components, num_components, largest_component,
        avg_size)``: the list of node sets, how many there are, the size
        of the biggest one, and the mean size. The three numbers are all
        ``0`` when the graph has no components.
    """
    clusters = list(nx.connected_components(G))
    if not clusters:
        return clusters, 0, 0, 0

    sizes = [len(cluster) for cluster in clusters]
    count = len(clusters)
    return clusters, count, max(sizes), sum(sizes) / count
54+
55+
56+
def interpret_connected_components(num_components, largest_component, avg_size):
    """Return the Markdown interpretation section for the analysis.

    The three statistics are embedded in a bullet list (``avg_size`` to
    two decimals), followed by fixed guidance on how to read them. The
    sections are joined with a newline, on top of the newline(s) each
    section already ends with.
    """
    heading = "## Interpretation of Connected Components\n"
    definition = (
        "Connected components represent clusters of fields that are directly or indirectly linked — "
        "that is, they frequently co-occur in the same sections of the JSON structure.\n\n"
    )
    statistics = "".join(
        (
            f"- **Number of Components:** {num_components}\n",
            f"- **Largest Component Size:** {largest_component}\n",
            f"- **Average Component Size:** {avg_size:.2f}\n\n",
        )
    )
    guidance_heading = "_Interpretation:_\n"
    few_large = (
        "- A **small number of large components** suggests that many fields are interrelated, "
        "indicating a cohesive JSON schema.\n"
    )
    many_small = (
        "- A **large number of small components** implies that some parts of the data are isolated "
        "or used in specialized contexts.\n"
    )
    core = (
        "- The **largest component** can be viewed as the 'core schema' — the main structure tying most "
        "fields together.\n"
    )

    sections = (
        heading,
        definition,
        statistics,
        guidance_heading,
        few_large,
        many_small,
        core,
    )
    return "\n".join(sections)
74+
75+
76+
def write_markdown_report(G, components, num_components, largest_component, avg_size, output_file):
    """Write connected component results and interpretation to a Markdown file.

    Args:
        G: Graph whose ``nodes`` and ``edges`` are counted for the header.
        components: Iterable of node collections, one per component.
        num_components: Number of connected components.
        largest_component: Size of the largest component.
        avg_size: Mean component size (written with two decimals).
        output_file: Destination path. Its parent directory is created if
            it does not exist; a bare filename writes to the current
            working directory.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("# JSON Connected Components Report\n")
        f.write(f"**Generated on:** {timestamp}\n\n")
        f.write(f"- Total Fields (Nodes): {len(G.nodes)}\n")
        f.write(f"- Total Relationships (Edges): {len(G.edges)}\n\n")

        f.write("## Connected Components Summary\n")
        f.write(f"- Number of Components: {num_components}\n")
        f.write(f"- Largest Component Size: {largest_component}\n")
        f.write(f"- Average Component Size: {avg_size:.2f}\n\n")

        f.write("## Top 5 Largest Components\n")
        # Largest first; field names are sorted so the output is stable.
        for i, comp in enumerate(sorted(components, key=len, reverse=True)[:5], 1):
            f.write(f"### Component {i} ({len(comp)} fields)\n")
            f.write(", ".join(sorted(comp)) + "\n\n")

        interpretation = interpret_connected_components(num_components, largest_component, avg_size)
        f.write(interpretation)

    print(f"✅ Connected Components report saved to: {output_file}")
101+
102+
103+
def main():
    """Fetch the meeting-summaries JSON, analyse it, and write the report.

    Steps: download the JSON, build the field co-occurrence graph, find
    its connected components, then write the Markdown report to
    ``reports/connected_components_report.md``.
    """
    source_url = (
        "https://raw.githubusercontent.com/SingularityNET-Archive/"
        "SingularityNET-Archive/refs/heads/main/Data/Snet-Ambassador-Program/"
        "Meeting-Summaries/2025/meeting-summaries-array.json"
    )
    report_dir = "reports"
    os.makedirs(report_dir, exist_ok=True)
    report_path = os.path.join(report_dir, "connected_components_report.md")

    print("📡 Fetching JSON data...")
    payload = load_json_remote(source_url)
    print("✅ JSON data successfully loaded.")

    print("🔍 Building field co-occurrence graph...")
    graph = build_field_graph(payload)
    print(f"📊 Graph contains {len(graph.nodes)} fields and {len(graph.edges)} edges.")

    print("🔗 Identifying connected components...")
    analysis = connected_components_analysis(graph)
    clusters, cluster_count, biggest, mean_size = analysis
    print(f"✅ Found {cluster_count} connected components.")

    write_markdown_report(graph, clusters, cluster_count, biggest, mean_size, report_path)


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# JSON Connected Components Report
2+
**Generated on:** 2025-10-15 08:55:46
3+
4+
- Total Fields (Nodes): 44
5+
- Total Relationships (Edges): 149
6+
7+
## Connected Components Summary
8+
- Number of Components: 6
9+
- Largest Component Size: 12
10+
- Average Component Size: 7.33
11+
12+
## Top 5 Largest Components
13+
### Component 1 (12 fields)
14+
date, documenter, host, mediaLink, meetingVideoLink, miroBoardLink, otherMediaLink, peoplePresent, purpose, timestampedVideo, typeOfMeeting, workingDocs
15+
16+
### Component 2 (12 fields)
17+
actionItems, agenda, assignee, decisionItems, discussion, discussionPoints, dueDate, gameRules, meetingTopics, narrative, status, text
18+
19+
### Component 3 (10 fields)
20+
agendaItems, canceledSummary, canceledSummaryText, meetingInfo, noSummaryGiven, noSummaryGivenText, tags, type, workgroup, workgroup_id
21+
22+
### Component 4 (4 fields)
23+
decision, effect, opposing, rationale
24+
25+
### Component 5 (4 fields)
26+
emotions, gamesPlayed, other, topicsCovered
27+
28+
## Interpretation of Connected Components
29+
30+
Connected components represent clusters of fields that are directly or indirectly linked — that is, they frequently co-occur in the same sections of the JSON structure.
31+
32+
33+
- **Number of Components:** 6
34+
- **Largest Component Size:** 12
35+
- **Average Component Size:** 7.33
36+
37+
38+
_Interpretation:_
39+
40+
- A **small number of large components** suggests that many fields are interrelated, indicating a cohesive JSON schema.
41+
42+
- A **large number of small components** implies that some parts of the data are isolated or used in specialized contexts.
43+
44+
- The **largest component** can be viewed as the 'core schema' — the main structure tying most fields together.

0 commit comments

Comments
 (0)