25 changes: 1 addition & 24 deletions .github/workflows/update-indexes.yml
@@ -27,30 +27,7 @@ jobs:
             --schemafile firstdata/schemas/datasource-schema.json
 
       - name: Check for duplicate IDs
-        run: |
-          uv run python - <<'EOF'
-          import json, sys
-          from pathlib import Path
-
-          seen = {}
-          errors = []
-
-          for path in sorted(Path("firstdata/sources").rglob("*.json")):
-              data = json.loads(path.read_text(encoding="utf-8"))
-              id_ = data.get("id")
-              if id_ in seen:
-                  errors.append(f"Duplicate id '{id_}' in:\n {seen[id_]}\n {path}")
-              else:
-                  seen[id_] = path
-
-          if errors:
-              print("❌ Duplicate IDs found:")
-              for e in errors:
-                  print(e)
-              sys.exit(1)
-
-          print(f"✅ All {len(seen)} IDs are unique.")
-          EOF
+        run: uv run python scripts/check_ids.py
 
       - name: Rebuild indexes
         run: uv run python scripts/build_indexes.py
25 changes: 1 addition & 24 deletions .github/workflows/validate-sources.yml
@@ -40,27 +40,4 @@ jobs:
             --schemafile firstdata/schemas/datasource-schema.json
 
       - name: Check for duplicate IDs
-        run: |
-          uv run python - <<'EOF'
-          import json, sys
-          from pathlib import Path
-
-          seen = {}
-          errors = []
-
-          for path in sorted(Path("firstdata/sources").rglob("*.json")):
-              data = json.loads(path.read_text(encoding="utf-8"))
-              id_ = data.get("id")
-              if id_ in seen:
-                  errors.append(f"Duplicate id '{id_}' in:\n {seen[id_]}\n {path}")
-              else:
-                  seen[id_] = path
-
-          if errors:
-              print("❌ Duplicate IDs found:")
-              for e in errors:
-                  print(e)
-              sys.exit(1)
-
-          print(f"✅ All {len(seen)} IDs are unique.")
-          EOF
+        run: uv run python scripts/check_ids.py
24 changes: 24 additions & 0 deletions Makefile
@@ -0,0 +1,24 @@
+.PHONY: validate check-ids check build-indexes help
+
+help:
+	@echo "Usage:"
+	@echo "  make validate       Validate all source JSON files against the schema"
+	@echo "  make check-ids      Check for duplicate IDs across all source files"
+	@echo "  make check          Run all checks (validate + check-ids)"
+	@echo "  make build-indexes  Rebuild all index and badge files"
+
+validate:
+	@echo "Validating source JSON files..."
+	@find firstdata/sources -name "*.json" | xargs uv run check-jsonschema \
+		--schemafile firstdata/schemas/datasource-schema.json
+	@echo "✅ All files are valid."
+
+check-ids:
+	@echo "Checking for duplicate IDs..."
+	@uv run python scripts/check_ids.py
+
+check: validate check-ids
+
+build-indexes:
+	@echo "Building indexes and badges..."
+	@uv run python scripts/build_indexes.py
76 changes: 0 additions & 76 deletions firstdata/sources/academic/economics/bis-statistics.json

This file was deleted.

109 changes: 0 additions & 109 deletions firstdata/sources/sectors/education/arwu.json

This file was deleted.

32 changes: 32 additions & 0 deletions scripts/check_ids.py
@@ -0,0 +1,32 @@
+"""Check for duplicate IDs across all source JSON files."""
+
+import json
+import sys
+from pathlib import Path
+
+SOURCES_DIR = Path(__file__).parent.parent / "firstdata" / "sources"
+
+
+def main() -> None:
+    seen: dict[str, Path] = {}
+    errors: list[str] = []
+
+    for path in sorted(SOURCES_DIR.rglob("*.json")):
+        data = json.loads(path.read_text(encoding="utf-8"))
+        id_ = data.get("id")
+        if id_ in seen:
+            errors.append(f"Duplicate id '{id_}' in:\n {seen[id_]}\n {path}")
+        else:
+            seen[id_] = path
+
+    if errors:
+        print("❌ Duplicate IDs found:")
+        for e in errors:
+            print(e)
+        sys.exit(1)
+
+    print(f"✅ All {len(seen)} IDs are unique.")
+
+
+if __name__ == "__main__":
+    main()
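
One benefit of moving the inline check into scripts/check_ids.py is that it can now be exercised outside CI. Below is a minimal pytest sketch (hypothetical, not part of this PR): it assumes scripts/ can be put on sys.path from the test file's location, and the file names a.json / b.json are illustrative only.

    # Hypothetical test sketch; not included in this PR.
    import json
    import sys
    from pathlib import Path

    import pytest

    # Illustrative path setup so `check_ids` can be imported from scripts/.
    sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
    import check_ids


    def test_duplicate_ids_exit_nonzero(tmp_path, monkeypatch):
        # Two files sharing one "id" should make main() exit with status 1.
        (tmp_path / "a.json").write_text(json.dumps({"id": "dup"}), encoding="utf-8")
        (tmp_path / "b.json").write_text(json.dumps({"id": "dup"}), encoding="utf-8")
        monkeypatch.setattr(check_ids, "SOURCES_DIR", tmp_path)
        with pytest.raises(SystemExit) as excinfo:
            check_ids.main()
        assert excinfo.value.code == 1


    def test_unique_ids_pass(tmp_path, monkeypatch):
        # Distinct ids should let main() return normally.
        (tmp_path / "a.json").write_text(json.dumps({"id": "one"}), encoding="utf-8")
        (tmp_path / "b.json").write_text(json.dumps({"id": "two"}), encoding="utf-8")
        monkeypatch.setattr(check_ids, "SOURCES_DIR", tmp_path)
        check_ids.main()  # no SystemExit expected

Pointing SOURCES_DIR at a temporary directory via monkeypatch is what makes the script testable without touching the real firstdata/sources tree.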