From 2744d885dd9e37e9ab57c7b98115b34905338aaf Mon Sep 17 00:00:00 2001
From: ningzimu <619883006@qq.com>
Date: Wed, 25 Feb 2026 19:13:08 +0800
Subject: [PATCH] feat(tooling): add Makefile and extract check_ids script

- add Makefile with validate, check-ids, check, build-indexes targets
- extract duplicate ID check into scripts/check_ids.py for reuse
- simplify CI workflows to call scripts/check_ids.py instead of inline heredoc
- remove duplicate source files: academic/economics/bis-statistics.json and sectors/education/arwu.json
---
 .github/workflows/update-indexes.yml          |  25 +---
 .github/workflows/validate-sources.yml        |  25 +---
 Makefile                                      |  24 ++++
 .../academic/economics/bis-statistics.json    |  76 ------------
 firstdata/sources/sectors/education/arwu.json | 109 ------------------
 scripts/check_ids.py                          |  32 +++++
 6 files changed, 58 insertions(+), 233 deletions(-)
 create mode 100644 Makefile
 delete mode 100644 firstdata/sources/academic/economics/bis-statistics.json
 delete mode 100644 firstdata/sources/sectors/education/arwu.json
 create mode 100644 scripts/check_ids.py

diff --git a/.github/workflows/update-indexes.yml b/.github/workflows/update-indexes.yml
index 422c528..e391868 100644
--- a/.github/workflows/update-indexes.yml
+++ b/.github/workflows/update-indexes.yml
@@ -27,30 +27,7 @@ jobs:
             --schemafile firstdata/schemas/datasource-schema.json
 
       - name: Check for duplicate IDs
-        run: |
-          uv run python - <<'EOF'
-          import json, sys
-          from pathlib import Path
-
-          seen = {}
-          errors = []
-
-          for path in sorted(Path("firstdata/sources").rglob("*.json")):
-              data = json.loads(path.read_text(encoding="utf-8"))
-              id_ = data.get("id")
-              if id_ in seen:
-                  errors.append(f"Duplicate id '{id_}' in:\n  {seen[id_]}\n  {path}")
-              else:
-                  seen[id_] = path
-
-          if errors:
-              print("❌ Duplicate IDs found:")
-              for e in errors:
-                  print(e)
-              sys.exit(1)
-
-          print(f"✅ All {len(seen)} IDs are unique.")
-          EOF
+        run: uv run python scripts/check_ids.py
 
       - name: Rebuild indexes
         run: uv run python scripts/build_indexes.py
diff --git a/.github/workflows/validate-sources.yml b/.github/workflows/validate-sources.yml
index 0264944..e346850 100644
--- a/.github/workflows/validate-sources.yml
+++ b/.github/workflows/validate-sources.yml
@@ -40,27 +40,4 @@ jobs:
             --schemafile firstdata/schemas/datasource-schema.json
 
       - name: Check for duplicate IDs
-        run: |
-          uv run python - <<'EOF'
-          import json, sys
-          from pathlib import Path
-
-          seen = {}
-          errors = []
-
-          for path in sorted(Path("firstdata/sources").rglob("*.json")):
-              data = json.loads(path.read_text(encoding="utf-8"))
-              id_ = data.get("id")
-              if id_ in seen:
-                  errors.append(f"Duplicate id '{id_}' in:\n  {seen[id_]}\n  {path}")
-              else:
-                  seen[id_] = path
-
-          if errors:
-              print("❌ Duplicate IDs found:")
-              for e in errors:
-                  print(e)
-              sys.exit(1)
-
-          print(f"✅ All {len(seen)} IDs are unique.")
-          EOF
+        run: uv run python scripts/check_ids.py
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d1fcfc6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,26 @@
+.PHONY: validate check-ids check build-indexes help
+
+# `help` is the first target, so a bare `make` prints the usage text.
+help:
+	@echo "Usage:"
+	@echo "  make validate       Validate all source JSON files against the schema"
+	@echo "  make check-ids      Check for duplicate IDs across all source files"
+	@echo "  make check          Run all checks (validate + check-ids)"
+	@echo "  make build-indexes  Rebuild all index and badge files"
+
+# -print0 / xargs -0 keep paths containing spaces or quotes intact.
+validate:
+	@echo "Validating source JSON files..."
+	@find firstdata/sources -type f -name "*.json" -print0 | xargs -0 uv run check-jsonschema \
+		--schemafile firstdata/schemas/datasource-schema.json
+	@echo "✅ All files are valid."
+
+check-ids:
+	@echo "Checking for duplicate IDs..."
+	@uv run python scripts/check_ids.py
+
+check: validate check-ids
+
+build-indexes:
+	@echo "Building indexes and badges..."
+	@uv run python scripts/build_indexes.py
diff --git a/firstdata/sources/academic/economics/bis-statistics.json b/firstdata/sources/academic/economics/bis-statistics.json
deleted file mode 100644
index 92e0bcf..0000000
--- a/firstdata/sources/academic/economics/bis-statistics.json
+++ /dev/null
@@ -1,76 +0,0 @@
-{
-  "id": "bis-statistics",
-  "name": {
-    "en": "BIS Statistics",
-    "zh": "国际清算银行统计数据"
-  },
-  "description": {
-    "en": "BIS statistics, compiled in cooperation with central banks and other national authorities, are designed to inform analysis of financial stability, international monetary spillovers and global liquidity. The BIS provides comprehensive data on international banking activity, debt securities, credit, derivatives, exchange rates, property prices, consumer prices, global liquidity, and payment statistics. Data covers activities in over 40 countries and is widely used by central banks, financial institutions, researchers, and policymakers for monetary and financial stability analysis.",
-    "zh": "国际清算银行（BIS）统计数据与各国中央银行及其他国家机构合作编制，旨在为金融稳定、国际货币溢出效应和全球流动性分析提供信息支持。BIS 提供全面的国际银行业务、债务证券、信贷、衍生品、汇率、房地产价格、消费者价格、全球流动性和支付统计数据。数据涵盖 40 多个国家的活动，广泛用于中央银行、金融机构、研究人员和政策制定者进行货币和金融稳定性分析。"
-  },
-  "website": "https://www.bis.org/",
-  "data_url": "https://data.bis.org/",
-  "api_url": "https://stats.bis.org/api-doc/v2/",
-  "country": null,
-  "domains": [
-    "International Banking",
-    "Debt Securities",
-    "Credit Markets",
-    "Foreign Exchange",
-    "Derivatives",
-    "Property Prices",
-    "Consumer Prices",
-    "Global Liquidity",
-    "Payment Systems",
-    "Central Bank Statistics"
-  ],
-  "geographic_scope": "global",
-  "update_frequency": "quarterly",
-  "tags": [
-    "international-banking",
-    "central-bank",
-    "financial-stability",
-    "global-liquidity",
-    "debt-securities",
-    "derivatives",
-    "exchange-rates",
-    "property-prices",
-    "payment-systems",
-    "monetary-policy",
-    "macroprudential",
-    "cross-border-flows",
-    "time-series",
-    "open-access",
-    "api",
-    "sdmx"
-  ],
-  "data_content": {
-    "en": [
-      "Locational Banking Statistics - International banking activity from a residence perspective",
-      "Consolidated Banking Statistics - Worldwide positions of internationally active banking groups",
-      "Debt Securities Statistics - International and domestic debt securities issuance and amounts outstanding",
-      "Credit to the Non-Financial Sector - Borrowing by government and private non-financial sectors",
-      "Global Liquidity Indicators - Foreign currency credit to non-residents in major currencies",
-      "Derivatives Statistics - OTC derivatives market activity and turnover (Triennial Survey)",
-      "Effective Exchange Rates - Nominal and real effective exchange rate indices",
-      "Residential Property Prices - Selected residential and commercial property price statistics",
-      "Consumer Price Indices - Consumer price inflation across countries",
-      "Central Bank Total Assets - Evolution of central bank balance sheets",
-      "Payment Statistics - Comparative payment statistics including card payments, terminals, and cashless transactions"
-    ],
-    "zh": [
-      "地域性银行统计 - 从居住地角度衡量的国际银行业务活动",
-      "合并银行统计 - 总部设在报告国的国际活跃银行集团的全球综合头寸",
-      "债务证券统计 - 国际和国内债务证券发行及未偿金额",
-      "非金融部门信贷 - 政府和私人非金融部门的借贷活动",
-      "全球流动性指标 - 主要货币向非居民提供的外币信贷",
-      "衍生品统计 - 场外衍生品市场活动和交易量（三年一次调查）",
-      "有效汇率 - 名义和实际有效汇率指数",
-      "住宅房地产价格 - 选定的住宅和商业房地产价格统计",
-      "消费者价格指数 - 各国消费者价格通胀",
-      "中央银行总资产 - 中央银行资产负债表规模的演变",
-      "支付统计 - 包括卡支付、终端和无现金交易在内的比较支付统计"
-    ]
-  },
-  "authority_level": "government"
-}
\ No newline at end of file
diff --git a/firstdata/sources/sectors/education/arwu.json b/firstdata/sources/sectors/education/arwu.json
deleted file mode 100644
index 3c72347..0000000
--- a/firstdata/sources/sectors/education/arwu.json
+++ /dev/null
@@ -1,109 +0,0 @@
-{
-  "id": "arwu",
-  "name": {
-    "en": "Academic Ranking of World Universities",
-    "zh": "世界大学学术排名"
-  },
-  "description": {
-    "en": "ARWU ranks world universities by academic and research performance indicators including Nobel Prizes, Fields Medals, highly cited researchers, and publications in Nature and Science. More than 2500 universities are ranked annually with the top 1000 published.",
-    "zh": "ARWU通过学术和研究绩效指标对世界大学进行排名，包括诺贝尔奖、菲尔兹奖、高被引研究者以及在Nature和Science上的发表。每年对2500多所大学进行排名，发布前1000名。"
-  },
-  "organization": {
-    "name": {
-      "en": "ShanghaiRanking Consultancy",
-      "zh": "软科排名咨询公司"
-    },
-    "type": "research",
-    "country": "CN",
-    "website": "https://www.shanghairanking.com/"
-  },
-  "website": "https://www.shanghairanking.com/",
-  "data_url": "https://www.shanghairanking.com/rankings/arwu/2025",
-  "authority_level": "research",
-  "country": null,
-  "domains": [
-    "higher education",
-    "university rankings",
-    "academic research",
-    "research performance"
-  ],
-  "geographic_scope": "global",
-  "update_frequency": "annually",
-  "data_content": [
-    {
-      "en": "World university rankings (top 1000)",
-      "zh": "世界大学排名（前1000名）"
-    },
-    {
-      "en": "Alumni and staff Nobel Prizes and Fields Medals",
-      "zh": "校友和教职员工诺贝尔奖和菲尔兹奖获得者"
-    },
-    {
-      "en": "Highly Cited Researchers by Clarivate Analytics",
-      "zh": "科睿唯安高被引研究者"
-    },
-    {
-      "en": "Publications in Nature and Science journals",
-      "zh": "Nature和Science期刊发表论文"
-    },
-    {
-      "en": "Papers indexed in SCIE and SSCI",
-      "zh": "SCIE和SSCI索引论文"
-    },
-    {
-      "en": "Per capita academic performance indicators",
-      "zh": "人均学术绩效指标"
-    },
-    {
-      "en": "Rankings by subject and region",
-      "zh": "学科和地区排名"
-    }
-  ],
-  "data_formats": [
-    "web",
-    "pdf"
-  ],
-  "license": {
-    "type": "copyright",
-    "restrictions": "Copyrighted by ShanghaiRanking Consultancy, free to view online",
-    "commercial_use": false
-  },
-  "tags": {
-    "en": [
-      "university rankings",
-      "ARWU",
-      "Shanghai Ranking",
-      "academic performance",
-      "research metrics",
-      "higher education",
-      "world universities",
-      "Nobel Prize",
-      "Fields Medal",
-      "citation analysis",
-      "research excellence",
-      "global ranking"
-    ],
-    "zh": [
-      "大学排名",
-      "世界大学学术排名",
-      "软科排名",
-      "学术表现",
-      "研究指标",
-      "高等教育",
-      "世界大学",
-      "诺贝尔奖",
-      "菲尔兹奖",
-      "引用分析",
-      "研究卓越",
-      "全球排名"
-    ]
-  },
-  "first_published": "2003",
-  "update_history": {
-    "latest_update": "2025",
-    "note": "Published annually since 2003, independently operated by ShanghaiRanking Consultancy since 2009"
-  },
-  "contact": {
-    "email": "pr@shanghairanking.com"
-  }
-}
diff --git a/scripts/check_ids.py b/scripts/check_ids.py
new file mode 100644
index 0000000..b64bd08
--- /dev/null
+++ b/scripts/check_ids.py
@@ -0,0 +1,70 @@
+"""Check for duplicate IDs across all source JSON files.
+
+Reads the top-level ``id`` of every ``*.json`` file under ``firstdata/sources``
+(located relative to this script, not the working directory) and reports each
+ID that appears in more than one file. Files that are not valid JSON or that
+have no string ``id`` are reported too instead of being counted as an ID.
+
+Exits with status 1 if any problem is found.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+SOURCES_DIR = Path(__file__).parent.parent / "firstdata" / "sources"
+
+
+def _read_id(path: Path) -> str:
+    """Return the top-level ``id`` of the JSON file at *path*.
+
+    Raises:
+        ValueError: if the file is not valid UTF-8 JSON, its top level is not
+            an object, or its ``id`` is missing or not a string.
+    """
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except ValueError as exc:  # JSONDecodeError and UnicodeDecodeError
+        raise ValueError(f"Invalid JSON in:\n  {path}\n  {exc}") from exc
+    id_ = data.get("id") if isinstance(data, dict) else None
+    if not isinstance(id_, str):
+        raise ValueError(f"Missing or non-string id in:\n  {path}")
+    return id_
+
+
+def main() -> None:
+    """Scan all source files; exit 1 on duplicate or unusable IDs."""
+    seen: dict[str, Path] = {}
+    duplicates: list[str] = []
+    invalid: list[str] = []
+
+    # Sorted so the first-seen file and the report order are deterministic.
+    for path in sorted(SOURCES_DIR.rglob("*.json")):
+        try:
+            id_ = _read_id(path)
+        except ValueError as exc:
+            invalid.append(str(exc))
+            continue
+        if id_ in seen:
+            duplicates.append(
+                f"Duplicate id '{id_}' in:\n  {seen[id_]}\n  {path}"
+            )
+        else:
+            seen[id_] = path
+
+    if invalid:
+        print("❌ Files without a usable id:")
+        for message in invalid:
+            print(message)
+    if duplicates:
+        print("❌ Duplicate IDs found:")
+        for message in duplicates:
+            print(message)
+    if invalid or duplicates:
+        sys.exit(1)
+
+    print(f"✅ All {len(seen)} IDs are unique.")
+
+
+if __name__ == "__main__":
+    main()