Skip to content

Commit 6d15399

Browse files
committed
Create Clustering Analysis script
1 parent b6f696b commit 6d15399

File tree

2 files changed

+172
-0
lines changed

2 files changed

+172
-0
lines changed
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import json
2+
import requests
3+
import networkx as nx
4+
from itertools import combinations
5+
from datetime import datetime
6+
import statistics
7+
import os
8+
9+
10+
def load_json_remote(url, timeout=30):
    """Load JSON data from a remote URL.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            server would block the script indefinitely.

    Returns:
        The decoded JSON payload, as produced by ``response.json()``.

    Raises:
        requests.exceptions.HTTPError: If the server responds with an
            error status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()
15+
16+
17+
def find_field_combinations(obj):
    """Collect the key set of every multi-key dict nested inside *obj*.

    The structure is walked depth-first in document order. Each dict that
    has more than one key contributes a ``set`` of its keys; dicts with
    zero or one key contribute nothing themselves, but their values are
    still searched. Lists are searched element by element, and any other
    value is ignored.

    Returns:
        A list of key sets, in the order the dicts are encountered.
    """
    found = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # that they are popped (visited) in their original order.
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            names = set(node.keys())
            if len(names) > 1:
                found.append(names)
            children = list(node.values())
        elif isinstance(node, list):
            children = node
        else:
            continue
        pending.extend(reversed(children))
    return found
30+
31+
32+
def build_field_graph(data):
    """Build an undirected, weighted field co-occurrence graph.

    Every key set reported by ``find_field_combinations`` contributes its
    fields as nodes and one edge per unordered pair of fields. An edge's
    ``weight`` attribute counts how many key sets contained both endpoints.

    Returns:
        The populated ``networkx.Graph``.
    """
    graph = nx.Graph()
    for fields in find_field_combinations(data):
        graph.add_nodes_from(fields)
        for left, right in combinations(fields, 2):
            if not graph.has_edge(left, right):
                # First time this pair is seen together.
                graph.add_edge(left, right, weight=1)
                continue
            graph[left][right]["weight"] += 1
    return graph
45+
46+
47+
def clustering_analysis(G):
    """Compute clustering coefficients.

    Args:
        G: Co-occurrence graph whose edges carry a ``weight`` attribute.

    Returns:
        A ``(local_clustering, avg_clustering, transitivity)`` tuple:
        the per-node weighted clustering coefficients (dict keyed by node),
        their weighted average, and the graph's transitivity. For a graph
        with no nodes the result is ``({}, 0.0, 0.0)``.
    """
    # nx.average_clustering divides by the number of nodes, so an empty
    # graph (e.g. input JSON without any multi-key object) would raise
    # ZeroDivisionError. Report neutral values instead.
    if len(G) == 0:
        return {}, 0.0, 0.0

    local_clustering = nx.clustering(G, weight="weight")
    avg_clustering = nx.average_clustering(G, weight="weight")
    # Global measure; nx.transitivity takes no weight argument, so unlike the
    # two values above it is computed on the unweighted graph.
    transitivity = nx.transitivity(G)
    return local_clustering, avg_clustering, transitivity
53+
54+
55+
def interpret_clustering(local_clustering, avg_clustering, transitivity):
    """Render the Markdown interpretation section for the clustering results.

    Args:
        local_clustering: Mapping of field name to clustering coefficient.
        avg_clustering: Average clustering coefficient of the graph.
        transitivity: Transitivity of the graph.

    Returns:
        A Markdown string with the global measures, the five fields with
        the highest local coefficient (ties keep mapping order), and a
        fixed explanatory paragraph.
    """
    ranked = list(local_clustering.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    leaders = ", ".join(f"{name} ({score:.3f})" for name, score in ranked[:5])

    heading = "## Interpretation of Clustering Results\n"
    overview = (
        "The clustering coefficient measures how likely a node’s neighbors "
        "are to also be connected to one another. High clustering suggests "
        "that related fields consistently appear together in the JSON structure, "
        "forming tightly interconnected groups.\n\n"
    )
    global_measures = (
        "### Global Measures\n"
        f"- **Average Clustering Coefficient:** {avg_clustering:.3f}\n"
        f"- **Network Transitivity:** {transitivity:.3f}\n\n"
    )
    leaders_heading = "### Fields with Highest Local Clustering\n"
    leaders_line = leaders + "\n\n"
    guidance = (
        "_Interpretation:_\n"
        "If the **average clustering coefficient** is high (e.g., >0.5), "
        "it indicates that many JSON fields co-occur frequently, forming "
        "cohesive 'themes' or substructures (like `participants` + `summary` + `workgroups`). "
        "A **low value** (e.g., <0.2) would suggest a more modular or fragmented structure, "
        "where fields are grouped into separate contexts. "
        "Fields with **high local clustering** serve as 'cluster cores' — they often appear "
        "in tight-knit groups, while those with low clustering tend to bridge distinct sections."
    )

    # Sections already end with their own newlines; the extra "\n" separator
    # is part of the established report layout.
    sections = (
        heading,
        overview,
        global_measures,
        leaders_heading,
        leaders_line,
        guidance,
    )
    return "\n".join(sections)
81+
82+
83+
def write_markdown_report(G, local_clustering, avg_clustering, transitivity, output_file):
    """Write clustering results and interpretation to a Markdown file.

    Args:
        G: Co-occurrence graph; only its node and edge counts are reported.
        local_clustering: Mapping of field name to clustering coefficient.
        avg_clustering: Average clustering coefficient of the graph.
        transitivity: Transitivity of the graph.
        output_file: Destination path. Its parent directory is created if
            missing; an existing file is overwritten.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    top_fields = sorted(local_clustering.items(), key=lambda x: x[1], reverse=True)[:10]

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("# JSON Field Clustering Coefficient Report\n")
        f.write(f"**Generated on:** {timestamp}\n\n")
        f.write(f"- Total Fields (Nodes): {len(G.nodes)}\n")
        f.write(f"- Total Relationships (Edges): {len(G.edges)}\n\n")

        f.write("## Local Clustering Coefficients (Top 10 Fields)\n")
        f.write("| Rank | Field | Clustering Coefficient |\n")
        f.write("|------|--------|-------------------------|\n")
        # Ranks are 1-based for human readers.
        for i, (node, coeff) in enumerate(top_fields, 1):
            f.write(f"| {i} | {node} | {coeff:.3f} |\n")
        f.write("\n")

        interpretation = interpret_clustering(local_clustering, avg_clustering, transitivity)
        f.write(interpretation)
        f.write("\n")

    print(f"✅ Clustering coefficient report saved to: {output_file}")
108+
109+
110+
def main():
    """Fetch the meeting-summary JSON, analyse field clustering, write the report.

    The report is written to ``reports/clustering_analysis_report.md``
    relative to the current working directory. Progress is printed to
    stdout at each stage.
    """
    report_dir = "reports"
    source_url = (
        "https://raw.githubusercontent.com/SingularityNET-Archive/"
        "SingularityNET-Archive/refs/heads/main/Data/Snet-Ambassador-Program/"
        "Meeting-Summaries/2025/meeting-summaries-array.json"
    )

    # The output directory is created before any network access.
    os.makedirs(report_dir, exist_ok=True)
    report_path = os.path.join(report_dir, "clustering_analysis_report.md")

    print("📡 Fetching JSON data...")
    payload = load_json_remote(source_url)
    print("✅ JSON data successfully loaded.")

    print("🔍 Building co-occurrence graph...")
    graph = build_field_graph(payload)
    print(f"📊 Graph contains {len(graph.nodes)} fields and {len(graph.edges)} edges.")

    print("📈 Computing clustering coefficients...")
    metrics = clustering_analysis(graph)

    # metrics is (local_clustering, avg_clustering, transitivity).
    write_markdown_report(graph, *metrics, report_path)
132+
133+
134+
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# JSON Field Clustering Coefficient Report
2+
**Generated on:** 2025-10-15 08:51:41
3+
4+
- Total Fields (Nodes): 44
5+
- Total Relationships (Edges): 149
6+
7+
## Local Clustering Coefficients (Top 10 Fields)
8+
| Rank | Field | Clustering Coefficient |
9+
|------|--------|-------------------------|
10+
| 1 | text | 0.680 |
11+
| 2 | assignee | 0.674 |
12+
| 3 | dueDate | 0.652 |
13+
| 4 | workgroup_id | 0.297 |
14+
| 5 | meetingInfo | 0.297 |
15+
| 6 | canceledSummary | 0.297 |
16+
| 7 | type | 0.297 |
17+
| 8 | noSummaryGiven | 0.297 |
18+
| 9 | workgroup | 0.297 |
19+
| 10 | agendaItems | 0.291 |
20+
21+
## Interpretation of Clustering Results
22+
23+
The clustering coefficient measures how likely a node’s neighbors are to also be connected to one another. High clustering suggests that related fields consistently appear together in the JSON structure, forming tightly interconnected groups.
24+
25+
26+
### Global Measures
27+
- **Average Clustering Coefficient:** 0.164
28+
- **Network Transitivity:** 0.903
29+
30+
31+
### Fields with Highest Local Clustering
32+
33+
text (0.680), assignee (0.674), dueDate (0.652), workgroup_id (0.297), meetingInfo (0.297)
34+
35+
36+
_Interpretation:_
37+
If the **average clustering coefficient** is high (e.g., >0.5), it indicates that many JSON fields co-occur frequently, forming cohesive 'themes' or substructures (like `participants` + `summary` + `workgroups`). A **low value** (e.g., <0.2) would suggest a more modular or fragmented structure, where fields are grouped into separate contexts. Fields with **high local clustering** serve as 'cluster cores' — they often appear in tight-knit groups, while those with low clustering tend to bridge distinct sections.

0 commit comments

Comments
 (0)