Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 33 additions & 10 deletions python/contributor_network/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,12 @@ def _fetch_data(
data_dir: Path,
github_token: str | None,
all_contributors: bool,
fetch_forking_orgs: bool = False,
) -> None:
"""Fetch contribution data from GitHub into JSON files."""
repositories = cfg.load_repositories()
contributors = cfg.all_contributors if all_contributors else cfg.core_contributors
core_usernames = cfg.core_usernames

if not repositories:
click.echo("No repositories in config.toml")
Expand All @@ -309,18 +311,35 @@ def _fetch_data(
client.update_repository(repo)

click.echo(" Updating links...")
client.update_links(repo, contributors)
client.update_links(repo, contributors, core_usernames=core_usernames)

if fetch_forking_orgs:
client.update_repository_forking_orgs(repo)

click.echo("Done fetching data.")


def _generate_csvs(cfg: Config, data_dir: Path) -> None:
"""Generate visualization CSVs and config.json from fetched JSON data."""
contributors = cfg.all_contributors
authors = list(contributors.values())
click.echo(f"Writing CSVs for {len(authors)} contributors")
core_names = set(cfg.core_contributors.values())

links = []
for path in (data_dir / "links").glob("**/*.json"):
links.append(Link.model_validate_json(path.read_text()).model_dump(mode="json"))

(data_dir / "top_contributors.csv").write_text("\n".join(["author_name"] + authors))
for link in links:
if link["author_name"] in core_names:
link["tier"] = "core"
else:
link["tier"] = "community"

core_count = sum(1 for lnk in links if lnk["tier"] == "core")
community_count = sum(1 for lnk in links if lnk["tier"] == "community")
unique_authors = {lnk["author_name"] for lnk in links}
click.echo(
f"Writing CSVs: {len(unique_authors)} contributors "
f"({core_count} core links, {community_count} community links)"
)

repositories = []
for path in (data_dir / "repositories").glob("**/*.json"):
Expand All @@ -333,9 +352,6 @@ def _generate_csvs(cfg: Config, data_dir: Path) -> None:
writer.writeheader()
writer.writerows(repositories)

links = []
for path in (data_dir / "links").glob("**/*.json"):
links.append(Link.model_validate_json(path.read_text()).model_dump(mode="json"))
with open(data_dir / "links.csv", "w", newline="") as f:
fieldnames = list(Link.model_json_schema()["properties"].keys())
writer = csv.DictWriter(f, fieldnames=fieldnames)
Expand All @@ -344,11 +360,12 @@ def _generate_csvs(cfg: Config, data_dir: Path) -> None:

config_json = {
"title": cfg.title,
"author": cfg.author,
"description": cfg.description,
"organization_name": cfg.organization_name,
"contributor_padding": cfg.contributor_padding,
"contributors": cfg.all_contributors,
"core_contributors": cfg.core_contributors,
"visualization": cfg.visualization.model_dump(),
}
(data_dir / "config.json").write_text(
json.dumps(config_json, indent=2, ensure_ascii=False)
Expand Down Expand Up @@ -422,6 +439,11 @@ def _assemble_site(cfg: Config, data_dir: Path, destination: Path) -> None:
is_flag=True,
help="Include all contributor groups when fetching link data",
)
@click.option(
"--fetch-forking-orgs",
is_flag=True,
help="Discover which organizations have forked each repo (extra API calls)",
)
@click.option(
"--skip-fetch",
is_flag=True,
Expand All @@ -438,6 +460,7 @@ def build(
data_dir: Path,
github_token: str | None,
all_contributors: bool,
fetch_forking_orgs: bool,
skip_fetch: bool,
csvs_only: bool,
) -> None:
Expand Down Expand Up @@ -465,7 +488,7 @@ def build(
if skip_fetch:
click.echo("Skipping GitHub fetch (--skip-fetch)")
else:
_fetch_data(cfg, data_dir, github_token, all_contributors)
_fetch_data(cfg, data_dir, github_token, all_contributors, fetch_forking_orgs)

_generate_csvs(cfg, data_dir)

Expand Down
109 changes: 99 additions & 10 deletions python/contributor_network/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from github import Github
from github.Auth import Auth
from github.GithubException import GithubException, RateLimitExceededException
from github.NamedUser import NamedUser
from github.Repository import Repository as Repo

Expand All @@ -26,36 +27,124 @@ def update_repository(self, repo: Repo) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(repository.model_dump_json())

def update_links(self, repo: Repo, contributors: dict[str, str]) -> None:
"""Update the links for a single repository."""
devseed_count = 0
def update_links(
self,
repo: Repo,
contributors: dict[str, str],
*,
core_usernames: set[str] | None = None,
) -> None:
"""Update the links for a single repository.

Args:
repo: GitHub repository object
contributors: Map of username -> display name for known contributors
core_usernames: Set of usernames classified as "core".
If provided, links are tagged with tier="core" or "community".
If None, all links default to tier="core".
"""
core_count = 0
for contributor in repo.get_contributors():
if contributor_name := contributors.get(contributor.login):
self.update_link(repo, contributor, contributor_name)
devseed_count += 1
tier = "core"
if core_usernames is not None:
tier = (
"core" if contributor.login in core_usernames else "community"
)
self.update_link(repo, contributor, contributor_name, tier=tier)
core_count += 1

self.update_repository_community_stats(repo.full_name, core_count)

# Update repository with community stats (Phase 2)
self.update_repository_community_stats(repo.full_name, devseed_count)
def discover_community_contributors(
self,
repo: Repo,
known_usernames: set[str],
*,
max_community: int = 100,
) -> list[tuple[NamedUser, str]]:
"""Discover contributors to a repo who are not in the known set.

Returns:
List of (NamedUser, display_name) tuples for community contributors.
"""
community: list[tuple[NamedUser, str]] = []
try:
for contributor in repo.get_contributors():
if contributor.login in known_usernames:
continue
display_name = contributor.name or contributor.login
community.append((contributor, display_name))
if len(community) >= max_community:
break
except RateLimitExceededException:
print(f" Rate limit hit discovering contributors for {repo.full_name}")
except GithubException as e:
print(f" Error discovering contributors for {repo.full_name}: {e}")
return community

def update_community_links(
self,
repo: Repo,
community_contributors: list[tuple[NamedUser, str]],
) -> None:
"""Create/update link data for community contributors."""
for contributor, display_name in community_contributors:
try:
self.update_link(repo, contributor, display_name, tier="community")
except GithubException as e:
print(f" Skipping community contributor {contributor.login}: {e}")

def update_repository_community_stats(
    self, repo_full_name: str, core_count: int
) -> None:
    """Update the community stats for a repository after processing contributors.

    Args:
        repo_full_name: "owner/name" slug used to locate the repository JSON.
        core_count: Number of core contributors found for the repository.

    Silently a no-op when the repository JSON has not been fetched yet.
    """
    # NOTE(review): span contained leftover old-version diff lines
    # (devseed_count signature/call); this is the coherent new version.
    path = self.directory / "repositories" / (repo_full_name + ".json")
    if path.exists():
        repository = Repository.model_validate_json(path.read_text())
        repository.update_community_stats(core_count)
        path.write_text(repository.model_dump_json())

def discover_forking_organizations(self, repo: Repo) -> list[str]:
"""Discover organizations that have forked this repository."""
org_names: list[str] = []
try:
for fork in repo.get_forks():
if fork.owner.type == "Organization":
org_names.append(fork.owner.login)
except RateLimitExceededException:
print(f" Rate limit hit discovering forks for {repo.full_name}")
except GithubException as e:
print(f" Error discovering forks for {repo.full_name}: {e}")
return sorted(set(org_names))

def update_repository_forking_orgs(self, repo: Repo) -> None:
"""Fetch forking organizations and persist to the repository JSON."""
orgs = self.discover_forking_organizations(repo)
path = self.directory / "repositories" / (repo.full_name + ".json")
if path.exists():
repository = Repository.model_validate_json(path.read_text())
repository.update_forking_organizations(orgs)
path.write_text(repository.model_dump_json())
if orgs:
print(f" Forking orgs for {repo.full_name}: {', '.join(orgs)}")

def update_link(
    self,
    repo: Repo,
    contributor: NamedUser,
    contributor_name: str,
    *,
    tier: str = "core",
) -> None:
    """Update the link for a single contributor to a single repository.

    Args:
        repo: GitHub repository object.
        contributor: The contributing GitHub user.
        contributor_name: Display name used when creating a new link record.
        tier: "core" or "community"; stored on the link for downstream CSVs.
    """
    # NOTE(review): span contained a leftover old-version signature line from
    # the diff paste; this is the coherent new version.
    path = self.directory / "links" / repo.full_name / (contributor.login + ".json")
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        link = Link.model_validate_json(path.read_text())
        link.update_from_github(repo, contributor)
    else:
        link = Link.from_github(repo, contributor, contributor_name)
    # Tier reflects the caller's current classification, so always refresh it
    # (dedupes the identical assignment from both branches).
    link.tier = tier
    path.write_text(link.model_dump_json())
9 changes: 9 additions & 0 deletions python/contributor_network/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,15 @@ def all_contributors(self) -> dict[str, str]:
"""Map of username -> display name for every contributor."""
return {c.username: c.name for c in self.load_contributors()}

@property
def core_usernames(self) -> set[str]:
    """Usernames of every contributor classified as core."""
    return {username for username in self.core_contributors}

def is_core(self, username: str) -> bool:
    """Return True when *username* belongs to the core contributor set."""
    core = self.core_usernames
    return username in core

@property
def devseed_contributors(self) -> dict[str, str]:
"""Only the first contributor group (backward compat)."""
Expand Down
19 changes: 11 additions & 8 deletions python/contributor_network/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Link(BaseModel):
# Phase 1: Computed contribution metrics
contribution_span_days: int = 0
is_recent_contributor: bool = False
tier: str = "core"

@classmethod
def from_github(cls, repo: Repo, contributor: NamedUser, author_name: str) -> Link:
Expand Down Expand Up @@ -79,9 +80,10 @@ class Repository(BaseModel):
repo_archived: bool = False
# Phase 2: Community metrics
repo_total_contributors: int = 0
repo_devseed_contributors: int = 0
repo_core_contributors: int = 0
repo_external_contributors: int = 0
repo_community_ratio: float = 0.0
repo_forking_organizations: str = ""

@classmethod
def from_github(cls, repo: Repo) -> Repository:
Expand Down Expand Up @@ -116,16 +118,17 @@ def from_github(cls, repo: Repo) -> Repository:
repo_total_contributors=total_contributors,
)

def update_community_stats(self, core_count: int) -> None:
    """Update community metrics given the count of core contributors.

    Args:
        core_count: Number of core contributors with links for this repo.

    Derives the external contributor count and the community ratio
    (external / total, rounded to 3 decimals; 0.0 when there are no
    contributors at all).
    """
    # NOTE(review): span contained the superseded devseed_count version fused
    # with the new one by the diff paste; this is the coherent new version.
    self.repo_core_contributors = core_count
    self.repo_external_contributors = self.repo_total_contributors - core_count
    if self.repo_total_contributors > 0:
        self.repo_community_ratio = round(
            self.repo_external_contributors / self.repo_total_contributors, 3
        )
    else:
        # Explicit reset so repeated calls on an empty repo stay consistent.
        self.repo_community_ratio = 0.0

def update_forking_organizations(self, org_names: list[str]) -> None:
    """Store a stable, comma-separated list of forking organizations."""
    unique_sorted = sorted(set(org_names))
    self.repo_forking_organizations = ",".join(unique_sorted)