From 78d8c0c3952ba8d325c586ca5d0cfb14533c111b Mon Sep 17 00:00:00 2001 From: Anthony Boyd <92742765+aboydnw@users.noreply.github.com> Date: Thu, 26 Feb 2026 18:36:10 -0500 Subject: [PATCH 1/2] data: add tier to links, forking orgs to repos, community discovery - Link model: add tier field (core/community) - Repository model: rename devseed_contributors to core_contributors, add forking_organizations field - Client: tier-aware update_links, discover_community_contributors, update_community_links, discover/update forking orgs Made-with: Cursor --- python/contributor_network/client.py | 109 ++++++++++++++++++++++++--- python/contributor_network/models.py | 19 +++-- 2 files changed, 110 insertions(+), 18 deletions(-) diff --git a/python/contributor_network/client.py b/python/contributor_network/client.py index a20aacd..abc0aeb 100644 --- a/python/contributor_network/client.py +++ b/python/contributor_network/client.py @@ -2,6 +2,7 @@ from github import Github from github.Auth import Auth +from github.GithubException import GithubException, RateLimitExceededException from github.NamedUser import NamedUser from github.Repository import Repository as Repo @@ -26,29 +27,115 @@ def update_repository(self, repo: Repo) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(repository.model_dump_json()) - def update_links(self, repo: Repo, contributors: dict[str, str]) -> None: - """Update the links for a single repository.""" - devseed_count = 0 + def update_links( + self, + repo: Repo, + contributors: dict[str, str], + *, + core_usernames: set[str] | None = None, + ) -> None: + """Update the links for a single repository. + + Args: + repo: GitHub repository object + contributors: Map of username -> display name for known contributors + core_usernames: Set of usernames classified as "core". + If provided, links are tagged with tier="core" or "community". + If None, all links default to tier="core". + """ + core_count = 0 for contributor in repo.get_contributors(): if contributor_name := contributors.get(contributor.login): - self.update_link(repo, contributor, contributor_name) - devseed_count += 1 + tier = "core" + if core_usernames is not None: + tier = ( + "core" if contributor.login in core_usernames else "community" + ) + self.update_link(repo, contributor, contributor_name, tier=tier) + core_count += 1 + + self.update_repository_community_stats(repo.full_name, core_count) - # Update repository with community stats (Phase 2) - self.update_repository_community_stats(repo.full_name, devseed_count) + def discover_community_contributors( + self, + repo: Repo, + known_usernames: set[str], + *, + max_community: int = 100, + ) -> list[tuple[NamedUser, str]]: + """Discover contributors to a repo who are not in the known set. + + Returns: + List of (NamedUser, display_name) tuples for community contributors. + """ + community: list[tuple[NamedUser, str]] = [] + try: + for contributor in repo.get_contributors(): + if contributor.login in known_usernames: + continue + display_name = contributor.name or contributor.login + community.append((contributor, display_name)) + if len(community) >= max_community: + break + except RateLimitExceededException: + print(f" Rate limit hit discovering contributors for {repo.full_name}") + except GithubException as e: + print(f" Error discovering contributors for {repo.full_name}: {e}") + return community + + def update_community_links( + self, + repo: Repo, + community_contributors: list[tuple[NamedUser, str]], + ) -> None: + """Create/update link data for community contributors.""" + for contributor, display_name in community_contributors: + try: + self.update_link(repo, contributor, display_name, tier="community") + except GithubException as e: + print(f" Skipping community contributor {contributor.login}: {e}") def update_repository_community_stats( - self, repo_full_name: str, devseed_count: int + self, repo_full_name: str, core_count: int ) -> None: """Update the community stats for a repository after processing contributors.""" path = self.directory / "repositories" / (repo_full_name + ".json") if path.exists(): repository = Repository.model_validate_json(path.read_text()) - repository.update_community_stats(devseed_count) + repository.update_community_stats(core_count) + path.write_text(repository.model_dump_json()) + + def discover_forking_organizations(self, repo: Repo) -> list[str]: + """Discover organizations that have forked this repository.""" + org_names: list[str] = [] + try: + for fork in repo.get_forks(): + if fork.owner.type == "Organization": + org_names.append(fork.owner.login) + except RateLimitExceededException: + print(f" Rate limit hit discovering forks for {repo.full_name}") + except GithubException as e: + print(f" Error discovering forks for {repo.full_name}: {e}") + return sorted(set(org_names)) + + def update_repository_forking_orgs(self, repo: Repo) -> None: + """Fetch forking organizations and persist to the repository JSON.""" + orgs = self.discover_forking_organizations(repo) + path = self.directory / "repositories" / (repo.full_name + ".json") + if path.exists(): + repository = Repository.model_validate_json(path.read_text()) + repository.update_forking_organizations(orgs) path.write_text(repository.model_dump_json()) + if orgs: + print(f" Forking orgs for {repo.full_name}: {', '.join(orgs)}") def update_link( - self, repo: Repo, contributor: NamedUser, contributor_name: str + self, + repo: Repo, + contributor: NamedUser, + contributor_name: str, + *, + tier: str = "core", ) -> None: """Update the link for a single contributor to a single repository.""" path = self.directory / "links" / repo.full_name / (contributor.login + ".json") @@ -56,6 +143,8 @@ def update_link( if path.exists(): link = Link.model_validate_json(path.read_text()) link.update_from_github(repo, contributor) + link.tier = tier else: link = Link.from_github(repo, contributor, contributor_name) + link.tier = tier path.write_text(link.model_dump_json()) diff --git a/python/contributor_network/models.py b/python/contributor_network/models.py index c5ff118..241414f 100644 --- a/python/contributor_network/models.py +++ b/python/contributor_network/models.py @@ -16,6 +16,7 @@ class Link(BaseModel): # Phase 1: Computed contribution metrics contribution_span_days: int = 0 is_recent_contributor: bool = False + tier: str = "core" @classmethod def from_github(cls, repo: Repo, contributor: NamedUser, author_name: str) -> Link: @@ -79,9 +80,10 @@ class Repository(BaseModel): repo_archived: bool = False # Phase 2: Community metrics repo_total_contributors: int = 0 - repo_devseed_contributors: int = 0 + repo_core_contributors: int = 0 repo_external_contributors: int = 0 repo_community_ratio: float = 0.0 + repo_forking_organizations: str = "" @classmethod def from_github(cls, repo: Repo) -> Repository: @@ -116,16 +118,17 @@ def from_github(cls, repo: Repo) -> Repository: repo_total_contributors=total_contributors, ) - def update_community_stats(self, devseed_count: int) -> None: - """Update community metrics given the count of DevSeed contributors. - - Call this after processing contributors for the repository. - """ - self.repo_devseed_contributors = devseed_count - self.repo_external_contributors = self.repo_total_contributors - devseed_count + def update_community_stats(self, core_count: int) -> None: + """Update community metrics given the count of core contributors.""" + self.repo_core_contributors = core_count + self.repo_external_contributors = self.repo_total_contributors - core_count if self.repo_total_contributors > 0: self.repo_community_ratio = round( self.repo_external_contributors / self.repo_total_contributors, 3 ) else: self.repo_community_ratio = 0.0 + + def update_forking_organizations(self, org_names: list[str]) -> None: + """Store the list of organizations that have forked this repository.""" + self.repo_forking_organizations = ",".join(sorted(set(org_names))) From 8b5ea39c60c075ebae8109f21801a6c3ef1459ad Mon Sep 17 00:00:00 2001 From: Anthony Boyd <92742765+aboydnw@users.noreply.github.com> Date: Thu, 26 Feb 2026 18:36:17 -0500 Subject: [PATCH 2/2] cli/config: add core_usernames, is_core, tier-aware build - Config: add core_usernames property and is_core() method - Build: pass core_usernames to client for tier tagging - Build: remap tiers in CSV generation based on config - Build: add --fetch-forking-orgs option - Config.json: include core_contributors and visualization settings Made-with: Cursor --- python/contributor_network/cli.py | 43 +++++++++++++++++++++------- python/contributor_network/config.py | 9 ++++++ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/python/contributor_network/cli.py b/python/contributor_network/cli.py index 0b9a774..57b1193 100644 --- a/python/contributor_network/cli.py +++ b/python/contributor_network/cli.py @@ -280,10 +280,12 @@ def _fetch_data( data_dir: Path, github_token: str | None, all_contributors: bool, + fetch_forking_orgs: bool = False, ) -> None: """Fetch contribution data from GitHub into JSON files.""" repositories = cfg.load_repositories() contributors = cfg.all_contributors if all_contributors else cfg.core_contributors + core_usernames = cfg.core_usernames if not repositories: click.echo("No repositories in config.toml") @@ -309,18 +311,35 @@ def _fetch_data( client.update_repository(repo) click.echo(" Updating links...") - client.update_links(repo, contributors) + client.update_links(repo, contributors, core_usernames=core_usernames) + + if fetch_forking_orgs: + client.update_repository_forking_orgs(repo) click.echo("Done fetching data.") def _generate_csvs(cfg: Config, data_dir: Path) -> None: """Generate visualization CSVs and config.json from fetched JSON data.""" - contributors = cfg.all_contributors - authors = list(contributors.values()) - click.echo(f"Writing CSVs for {len(authors)} contributors") + core_names = set(cfg.core_contributors.values()) + + links = [] + for path in (data_dir / "links").glob("**/*.json"): + links.append(Link.model_validate_json(path.read_text()).model_dump(mode="json")) - (data_dir / "top_contributors.csv").write_text("\n".join(["author_name"] + authors)) + for link in links: + if link["author_name"] in core_names: + link["tier"] = "core" + else: + link["tier"] = "community" + + core_count = sum(1 for lnk in links if lnk["tier"] == "core") + community_count = sum(1 for lnk in links if lnk["tier"] == "community") + unique_authors = {lnk["author_name"] for lnk in links} + click.echo( + f"Writing CSVs: {len(unique_authors)} contributors " + f"({core_count} core links, {community_count} community links)" + ) repositories = [] for path in (data_dir / "repositories").glob("**/*.json"): @@ -333,9 +352,6 @@ def _generate_csvs(cfg: Config, data_dir: Path) -> None: writer.writeheader() writer.writerows(repositories) - links = [] - for path in (data_dir / "links").glob("**/*.json"): - links.append(Link.model_validate_json(path.read_text()).model_dump(mode="json")) with open(data_dir / "links.csv", "w", newline="") as f: fieldnames = list(Link.model_json_schema()["properties"].keys()) writer = csv.DictWriter(f, fieldnames=fieldnames) @@ -344,11 +360,12 @@ def _generate_csvs(cfg: Config, data_dir: Path) -> None: config_json = { "title": cfg.title, - "author": cfg.author, "description": cfg.description, "organization_name": cfg.organization_name, "contributor_padding": cfg.contributor_padding, "contributors": cfg.all_contributors, + "core_contributors": cfg.core_contributors, + "visualization": cfg.visualization.model_dump(), } (data_dir / "config.json").write_text( json.dumps(config_json, indent=2, ensure_ascii=False) @@ -422,6 +439,11 @@ def _assemble_site(cfg: Config, data_dir: Path, destination: Path) -> None: is_flag=True, help="Include all contributor groups when fetching link data", ) +@click.option( + "--fetch-forking-orgs", + is_flag=True, + help="Discover which organizations have forked each repo (extra API calls)", +) @click.option( "--skip-fetch", is_flag=True, @@ -438,6 +460,7 @@ def build( data_dir: Path, github_token: str | None, all_contributors: bool, + fetch_forking_orgs: bool, skip_fetch: bool, csvs_only: bool, ) -> None: @@ -465,7 +488,7 @@ def build( if skip_fetch: click.echo("Skipping GitHub fetch (--skip-fetch)") else: - _fetch_data(cfg, data_dir, github_token, all_contributors) + _fetch_data(cfg, data_dir, github_token, all_contributors, fetch_forking_orgs) _generate_csvs(cfg, data_dir) diff --git a/python/contributor_network/config.py b/python/contributor_network/config.py index 12dc419..6f58194 100644 --- a/python/contributor_network/config.py +++ b/python/contributor_network/config.py @@ -125,6 +125,15 @@ def all_contributors(self) -> dict[str, str]: """Map of username -> display name for every contributor.""" return {c.username: c.name for c in self.load_contributors()} + @property + def core_usernames(self) -> set[str]: + """Set of usernames classified as core contributors.""" + return set(self.core_contributors.keys()) + + def is_core(self, username: str) -> bool: + """Check whether a username is a core contributor.""" + return username in self.core_usernames + @property def devseed_contributors(self) -> dict[str, str]: """Only the first contributor group (backward compat)."""