From df03bba2018a951d90f553522b6c19f726086dc2 Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 5 May 2026 22:57:02 +0800 Subject: [PATCH 1/4] added some docs, fixed lint error --- .gitattributes | 15 ++ .github/CODEOWNERS | 8 + .github/ISSUE_TEMPLATE/bug_report.yml | 59 +++++ .github/ISSUE_TEMPLATE/config.yml | 2 + .github/ISSUE_TEMPLATE/feature_request.yml | 34 +++ .github/dependabot.yml | 13 ++ .github/pull_request_template.md | 12 + .github/workflows/ci.yml | 27 ++- .github/workflows/codeql.yml | 33 +++ .pre-commit-config.yaml | 17 ++ .vscode/settings.json | 2 +- CHANGELOG.md | 18 ++ CODE_OF_CONDUCT.md | 77 +++++++ CONTRIBUTING.md | 77 +++++++ README.md | 242 ++++++++++++--------- SECURITY.md | 25 +++ docs/handoff.md | 69 ++++++ docs/onboarding.md | 227 +++++++++++++++++++ pyproject.toml | 23 ++ src/paperscout/__init__.py | 2 +- src/paperscout/__main__.py | 34 +-- src/paperscout/config.py | 2 +- src/paperscout/db.py | 2 +- src/paperscout/health.py | 32 +-- src/paperscout/monitor.py | 91 +++++--- src/paperscout/scout.py | 42 +++- src/paperscout/sources.py | 98 +++++---- src/paperscout/storage.py | 36 +-- tests/conftest.py | 13 +- tests/test_health.py | 2 + tests/test_models.py | 26 ++- tests/test_monitor.py | 46 ++-- tests/test_scout.py | 110 +++++++--- tests/test_sources.py | 97 ++++++--- tests/test_storage.py | 43 ++-- 35 files changed, 1304 insertions(+), 352 deletions(-) create mode 100644 .gitattributes create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/dependabot.yml create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/codeql.yml create mode 100644 .pre-commit-config.yaml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 SECURITY.md create mode 100644 docs/handoff.md 
create mode 100644 docs/onboarding.md diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..5fed995 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,15 @@ +# Default: normalize line endings in the repo checkout. +* text=auto + +# Unix-style scripts and CI configs +*.sh text eol=lf +.github/**/*.yml text eol=lf + +# Python sources +*.py text eol=lf + +# Markdown and docs +*.md text eol=lf + +# Windows batch files (if any) +*.bat text eol=crlf diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..912c957 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,8 @@ +# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners +# +# Uncomment and set one or more GitHub usernames or @org/team-name entries. +# Teams must exist in the org and have visibility to this repository. +# +# Example: +# * @alice @bob +# /docs/ @alice diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..212df45 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,59 @@ +name: Bug report +description: File a bug report to help us improve paperscout +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report. + **Do not** use this form for security vulnerabilities — follow **[SECURITY.md](../SECURITY.md)** (private reporting). + + - type: textarea + id: summary + attributes: + label: Summary + description: What went wrong, in one short paragraph? + validations: + required: true + + - type: textarea + id: repro + attributes: + label: Steps to reproduce + description: How can a maintainer reproduce the issue? + placeholder: | + 1. … + 2. … + 3. 
… + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behavior + validations: + required: true + + - type: textarea + id: actual + attributes: + label: Actual behavior + validations: + required: true + + - type: input + id: version + attributes: + label: Version / commit + description: Git SHA or release tag, if known + validations: + required: false + + - type: textarea + id: env + attributes: + label: Environment + description: OS, Python version, Docker vs bare metal, etc. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..64eb98d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,2 @@ +blank_issues_enabled: true +contact_links: [] diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..32e1aee --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,34 @@ +name: Feature request +description: Suggest an idea for paperscout +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem / motivation + description: What user or operator problem does this solve? + validations: + required: true + + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: What would you like to see happen? + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + validations: + required: false + + - type: textarea + id: context + attributes: + label: Additional context + description: Links, screenshots, Slack behavior, etc. 
+ validations: + required: false diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..8be1ecc --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,13 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 10 + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 10 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..1e7519f --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,12 @@ +## Summary + + + +## Test plan + +- [ ] `./run check` (or `make check`) +- [ ] `pre-commit run --all-files` + +## Related issues + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d11cb54..85d5d21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,11 +2,34 @@ name: CI on: push: - branches: [main] + branches: [main, develop] pull_request: - branches: [main] + branches: [main, develop] jobs: + lint: + name: Lint (pre-commit) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Cache pre-commit environments + uses: actions/cache@v4 + with: + path: ~/.cache/pre-commit + key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} + + - name: Run pre-commit + run: pre-commit run --all-files + test: name: Test (Python ${{ matrix.python-version }}) runs-on: ubuntu-latest diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..585eb80 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,33 @@ +name: CodeQL + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + - cron: "0 12 * * 1" + +jobs: + analyze: + name: Analyze (Python) + 
runs-on: ubuntu-latest + permissions: + security-events: write + actions: read + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: python + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9da40d6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +# See https://pre-commit.com +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + args: ["--maxkb=800"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.6 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/.vscode/settings.json b/.vscode/settings.json index 7e68766..af690fc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ { "python-envs.pythonProjects": [] -} \ No newline at end of file +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3afa3ec --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- Open-source hygiene: contributing guide, security policy, code of conduct, onboarding and handoff docs, pre-commit (Ruff), GitHub issue templates, Dependabot, CodeQL, CODEOWNERS template, and `.gitattributes`. + +## [0.1.0] - 2026-05-05 + +### Added + +- Initial public release as tracked in `pyproject.toml` (WG21 paper tracking, Slack integration, PostgreSQL storage, Docker deploy, CI/CD workflows). 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..ea5a2d0 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,77 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience +- Focusing on what is best not just for us as individuals, but for the overall community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances of any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 
+ +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the **repository maintainers** (see [GitHub organization](https://github.com/cppalliance) or use [GitHub Report abuse](https://github.com/contact/report-abuse) for content hosted on GitHub). Replace this paragraph with a dedicated conduct email or ticketing process when your organization publishes one. + +All community leaders are obligated to respect the privacy and security of the reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of actions. + +**Consequence**: A warning with consequences for continued behavior. 
No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). + +For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at [https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations). 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..5a43138 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,77 @@ +# Contributing to paperscout + +Thank you for your interest in improving paperscout. This document describes how we work, how to run checks locally, and how releases are cut. + +## Community standards + +- Please read the [Code of Conduct](CODE_OF_CONDUCT.md). Participation is governed by the **Contributor Covenant 2.1**. +- Report **security vulnerabilities** privately — see [SECURITY.md](SECURITY.md). Do not open public issues for undisclosed security problems. + +## Where to start + +- **[docs/onboarding.md](docs/onboarding.md)** — clone, database, `.env`, tests, and running the app locally. +- **[docs/handoff.md](docs/handoff.md)** — maintainer-oriented design notes and operational gotchas. +- **[README.md](README.md)** — product behavior, Slack setup, deployment, and environment variable tables. + +## Workflow + +1. **Fork** the repository (if you lack direct push access) and **clone** your fork. +2. Create a **feature branch** from `main` (this repo’s default branch; open PRs target `main`). +3. Make focused commits with clear messages. +4. Open a **pull request** against `main`. Use the PR template; link related issues when applicable. +5. Ensure **CI is green** (tests + coverage + lint hooks — see below). + +### Code owners + +[`.github/CODEOWNERS`](.github/CODEOWNERS) routes review requests. If GitHub reports unknown owners, maintainers should update that file with real `@username` or `@org/team` entries and ensure the team exists and has repository access. 
+ +## Local checks + +Install the package in editable mode with dev dependencies: + +```bash +python -m venv .venv +source .venv/bin/activate # Windows Git Bash: source .venv/Scripts/activate +pip install -e ".[dev]" +``` + +### Tests and coverage + +```bash +./run check # pytest + coverage, fails under 90% line coverage (matches CI) +# or: make check +``` + +CI runs the same coverage gate on Python **3.10**, **3.11**, and **3.12** (see `.github/workflows/ci.yml`). + +### Lint and format (Ruff + pre-commit) + +We use **[pre-commit](https://pre-commit.com/)** with **[Ruff](https://docs.astral.sh/ruff/)** for linting and formatting. + +```bash +pre-commit install +pre-commit run --all-files +``` + +CI runs `pre-commit run --all-files` on every push and pull request (see the `lint` job in `.github/workflows/ci.yml`). + +## Expectations for changes + +- **Tests** — Add or update tests for behavior changes. Keep coverage at or above the project floor (**90%**). +- **Docs** — Update README, onboarding, or handoff when you change operator-visible behavior, env vars, or deployment steps. +- **Style** — Let Ruff format the tree; avoid unrelated drive-by reformatting of untouched files in the same PR when possible. + +## Releases + +We follow **[Semantic Versioning](https://semver.org/)** and **[Keep a Changelog](https://keepachangelog.com/)** principles. + +1. **Version** — Bump `version` in [`pyproject.toml`](pyproject.toml) (e.g. `0.1.0` → `0.2.0`). +2. **Changelog** — Move items from `## [Unreleased]` to a new section `## [x.y.z] - YYYY-MM-DD` in [`CHANGELOG.md`](CHANGELOG.md). +3. **Tag** — Create an annotated tag: `git tag -a v0.2.0 -m "Release v0.2.0"` and push it: `git push origin v0.2.0`. +4. **GitHub Release** — On GitHub, create a **Release** from that tag and paste the changelog section for that version into the release notes. + +Optional follow-ups (not required today): PyPI publishing workflow, signed tags, or automated release notes. 
+ +## Questions + +Use **GitHub Issues** for bugs and feature ideas (see issue templates). For organizational or access questions, contact the **CppAlliance** maintainers responsible for this repository (replace with a concrete contact when publishing internally). diff --git a/README.md b/README.md index 803797f..f8746d3 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ WG21 C++ paper tracker with ISO draft probing and Slack notifications. A Python project that probes the isocpp.org paper system for unpublished D-paper drafts, monitors for new paper assignments at the frontier, and notifies a Slack channel when watched authors publish. +**Docs:** [Developer onboarding](docs/onboarding.md) (clone → DB → tests → run) · [Maintainer handoff](docs/handoff.md) · [Contributing](CONTRIBUTING.md) · [Changelog](CHANGELOG.md) · [Security](SECURITY.md) · [Code of conduct](CODE_OF_CONDUCT.md) + +If you only need to run tests or a local instance, start with **onboarding** before the Slack app sections below. + ## Features - **Per-user watchlists** -- each user manages their own list of authors and paper numbers via DM; the scout sends a personal DM when a match is found @@ -29,18 +33,18 @@ A Python project that probes the isocpp.org paper system for unpublished D-paper Go to **OAuth & Permissions** in the left sidebar. 
Under **Bot Token Scopes**, add: -| Scope | Why | -|-------|-----| -| `chat:write` | Post messages to channels and send DMs | -| `chat:write.public` | Post to public channels the scout hasn't been invited to | -| `im:history` | Read messages in 1:1 DMs with the scout | -| `im:write` | Open 1:1 DM conversations to deliver watchlist alerts | -| `mpim:history` | Read messages in group DMs the scout has been invited to | -| `mpim:write` | Reply in group DMs | -| `channels:history` | Read messages in public channels | -| `groups:history` | Read messages in private channels the scout is invited to | -| `groups:write` | Reply in private channels | -| `app_mentions:read` | Respond when someone `@paperscout`s | +| Scope | Why | +| ------------------- | --------------------------------------------------------- | +| `chat:write` | Post messages to channels and send DMs | +| `chat:write.public` | Post to public channels the scout hasn't been invited to | +| `im:history` | Read messages in 1:1 DMs with the scout | +| `im:write` | Open 1:1 DM conversations to deliver watchlist alerts | +| `mpim:history` | Read messages in group DMs the scout has been invited to | +| `mpim:write` | Reply in group DMs | +| `channels:history` | Read messages in public channels | +| `groups:history` | Read messages in private channels the scout is invited to | +| `groups:write` | Reply in private channels | +| `app_mentions:read` | Respond when someone `@paperscout`s | > **Note on group DMs (`mpim`):** When the scout is invited to a group DM, `watchlist` commands are rejected with a friendly error telling the user to use a 1:1 DM instead. `status` and `help` work normally. The `mpim:history` and `mpim:write` scopes are needed to receive and reply to those messages. @@ -81,7 +85,7 @@ cp .env.example .env Edit `.env` with your credentials and preferences: -```env +````env SLACK_SIGNING_SECRET= SLACK_BOT_TOKEN=xoxb- PORT=3000 @@ -102,7 +106,7 @@ Install and run: ```bash pip install -e . 
python -m paperscout -``` +```` ### 7. Set the Request URL @@ -137,18 +141,37 @@ ngrok http 3000 6. Type `@paperscout status` in a channel — should reply in-thread 7. Check your notification channel after 30 minutes — frontier hits and D→P transitions appear there; personal watchlist matches arrive as DMs -### Production Deployment +### Deployment -The scout runs as a Docker container deployed via CD on every push to `main`. It connects to the host's shared PostgreSQL and sits behind nginx (TLS on `:443`). +The scout runs as a Docker container deployed via CD. A push to **`main`** deploys to **production**; a push to **`develop`** deploys to **staging**. Both paths run the same workflow and the same job — only the **GitHub Environment** changes. +```text +Push to main → CI tests → SSH into prod → git pull --ff-only → docker compose up --build → Health check (retry) +Push to develop → CI tests → SSH into staging → git pull --ff-only → docker compose up --build → Health check (retry) ``` -Push to main → CI tests → SSH into server → git pull → docker compose up --build → Health check -``` -Quick start on a fresh server: +#### Configure GitHub Environments + +Create two environments under **Settings → Environments**: `production` and `staging`. 
Both use the **same secret names** (different values per environment) and a small set of per-environment **Variables**: + +| Type | Name | Production | Staging | +| -------- | ---------------- | ----------------------- | --------------------------------- | +| Secret | `SERVER_HOST` | prod host / IP | staging host / IP | +| Secret | `SERVER_USER` | deploy user | deploy user | +| Secret | `SERVER_SSH_KEY` | private key | private key | +| Secret | `SERVER_PORT` | optional (default `22`) | optional (default `22`) | +| Variable | `DEPLOY_PATH` | `/opt/paperscout` | `/opt/paperscout-staging` | +| Variable | `DEPLOY_BRANCH` | `main` | `develop` | +| Variable | `HEALTH_PORT` | `9101` | `9102` (or whatever staging maps) | + +The workflow picks the environment from the branch (`refs/heads/main` → `production`, `refs/heads/develop` → `staging`), so values like `DEPLOY_PATH` and `HEALTH_PORT` are not hard-coded in the YAML. + +> Tip: enable **Required reviewers** on the `production` environment for a manual approval gate before prod deploys. 
+ +#### Quick start on a fresh server ```bash -# On the server (after Docker, PostgreSQL, and nginx are set up) +# On the production server (after Docker, PostgreSQL, and nginx are set up) git clone https://github.com/cppalliance/paperscout-python.git /opt/paperscout cd /opt/paperscout cp .env.example .env # edit with real credentials @@ -156,6 +179,16 @@ docker compose up -d --build curl -sf http://localhost:9101/health ``` +On the **staging** server (separate host or separate path on the same host; must match the `staging` environment's `DEPLOY_PATH` and expose `/health` on `HEALTH_PORT`): + +```bash +git clone -b develop https://github.com/cppalliance/paperscout-python.git /opt/paperscout-staging +cd /opt/paperscout-staging +cp .env.example .env # use staging credentials / DB / Slack app as appropriate +docker compose up -d --build +curl -sf http://localhost:9102/health +``` + See [`deploy/SERVER_SETUP.md`](deploy/SERVER_SETUP.md) for the full Ubuntu 22.04 provisioning guide, and [`.github/workflows/cd.yml`](.github/workflows/cd.yml) for the CD pipeline. Database backups run daily via [`.github/workflows/db-backup.yml`](.github/workflows/db-backup.yml), uploading `pg_dump` snapshots to Google Cloud Storage. @@ -164,14 +197,14 @@ Database backups run daily via [`.github/workflows/db-backup.yml`](.github/workf Watchlist commands only work in a **1:1 DM** with the scout (each user has their own independent watchlist). `status` and `help` work everywhere — DMs, group DMs, and channels via `@paperscout`. 
-| Command | Where | Description | -|---------|-------|-------------| -| `watchlist` | DM only | Show your personal watchlist | -| `watchlist list` | DM only | Show your personal watchlist | -| `watchlist add ` | DM only | Add an author name substring *or* paper number — type is auto-detected | -| `watchlist remove ` | DM only | Remove an entry from your watchlist | -| `status` | Anywhere | Show papers loaded, last poll time, probe stats | -| `help` | Anywhere | Show command summary | +| Command | Where | Description | +| ----------------------------------- | -------- | ---------------------------------------------------------------------- | +| `watchlist` | DM only | Show your personal watchlist | +| `watchlist list` | DM only | Show your personal watchlist | +| `watchlist add ` | DM only | Add an author name substring _or_ paper number — type is auto-detected | +| `watchlist remove ` | DM only | Remove an entry from your watchlist | +| `status` | Anywhere | Show papers loaded, last poll time, probe stats | +| `help` | Anywhere | Show command summary | ### Watchlist matching @@ -186,84 +219,85 @@ All parameters are configurable via environment variables or a `.env` file. 
See ### Required -| Variable | Description | -|----------|-------------| -| `SLACK_SIGNING_SECRET` | Slack app signing secret | -| `SLACK_BOT_TOKEN` | Slack bot token (`xoxb-...`) | -| `DATABASE_URL` | PostgreSQL connection string (`postgresql://user:pass@host:5432/db`) | +| Variable | Description | +| ---------------------- | -------------------------------------------------------------------- | +| `SLACK_SIGNING_SECRET` | Slack app signing secret | +| `SLACK_BOT_TOKEN` | Slack bot token (`xoxb-...`) | +| `DATABASE_URL` | PostgreSQL connection string (`postgresql://user:pass@host:5432/db`) | ### Scheduling -| Variable | Default | Description | -|----------|---------|-------------| -| `POLL_INTERVAL_MINUTES` | `30` | Main polling cycle interval | -| `ENABLE_BULK_WG21` | `true` | Fetch wg21.link/index.json each cycle | -| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org scraping (not yet scheduled) | -| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle | +| Variable | Default | Description | +| ----------------------- | ------- | ------------------------------------------------------ | +| `POLL_INTERVAL_MINUTES` | `30` | Main polling cycle interval | +| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | Minimum sleep after a poll cycle that overran the interval (avoids tight loops when work or errors stretch a cycle) | +| `ENABLE_BULK_WG21` | `true` | Fetch wg21.link/index.json each cycle | +| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org scraping (not yet scheduled) | +| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle | ### Probe Prefixes / Extensions -| Variable | Default | Description | -|----------|---------|-------------| -| `PROBE_PREFIXES` | `["D","P"]` | Prefixes for gap/unknown numbers | -| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | File extensions to check | +| Variable | Default | Description | +| ------------------ | ------------------ | -------------------------------- | +| `PROBE_PREFIXES` | 
`["D","P"]` | Prefixes for gap/unknown numbers | +| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | File extensions to check | ### Frontier -| Variable | Default | Description | -|----------|---------|-------------| -| `FRONTIER_WINDOW_ABOVE` | `60` | Numbers above effective frontier to probe every cycle | -| `FRONTIER_WINDOW_BELOW` | `30` | Numbers below effective frontier to probe every cycle | -| `FRONTIER_EXPLICIT_RANGES` | `[]` | Additional explicit ranges, e.g. `[{"min":4033,"max":4060}]` | -| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before treating a number as an outlier (prevents pre-assigned far-future numbers like P5000 from shifting the frontier) | +| Variable | Default | Description | +| -------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `FRONTIER_WINDOW_ABOVE` | `60` | Numbers above effective frontier to probe every cycle | +| `FRONTIER_WINDOW_BELOW` | `30` | Numbers below effective frontier to probe every cycle | +| `FRONTIER_EXPLICIT_RANGES` | `[]` | Additional explicit ranges, e.g. 
`[{"min":4033,"max":4060}]` | +| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before treating a number as an outlier (prevents pre-assigned far-future numbers like P5000 from shifting the frontier) | ### Hot Probing (every 30-min cycle) -| Variable | Default | Description | -|----------|---------|-------------| -| `HOT_LOOKBACK_MONTHS` | `6` | Papers with a date within this window are probed every cycle | -| `HOT_REVISION_DEPTH` | `2` | Revisions ahead of known latest to probe for hot papers | +| Variable | Default | Description | +| --------------------- | ------- | ------------------------------------------------------------ | +| `HOT_LOOKBACK_MONTHS` | `6` | Papers with a date within this window are probed every cycle | +| `HOT_REVISION_DEPTH` | `2` | Revisions ahead of known latest to probe for hot papers | ### Cold Probing (full coverage, distributed ≈ once per day) -| Variable | Default | Description | -|----------|---------|-------------| -| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold papers | -| `COLD_CYCLE_DIVISOR` | `48` | Cold pool is split into N slices; each cycle probes 1 slice (48 × 30 min = 24 h) | -| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe R0 through this revision | +| Variable | Default | Description | +| --------------------- | ------- | -------------------------------------------------------------------------------- | +| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold papers | +| `COLD_CYCLE_DIVISOR` | `48` | Cold pool is split into N slices; each cycle probes 1 slice (48 × 30 min = 24 h) | +| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe R0 through this revision | ### Timestamp-Based Alerting -| Variable | Default | Description | -|----------|---------|-------------| -| `ALERT_MODIFIED_HOURS` | `24` | Only notify for hits where the server's `Last-Modified` header is within this many hours of now. Falls back to "alert" when the header is absent. 
| +| Variable | Default | Description | +| ---------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ALERT_MODIFIED_HOURS` | `24` | Only notify for hits where the server's `Last-Modified` header is within this many hours of now. Falls back to "alert" when the header is absent. | ### HTTP Client -| Variable | Default | Description | -|----------|---------|-------------| -| `HTTP_CONCURRENCY` | `20` | Maximum simultaneous probe requests | -| `HTTP_TIMEOUT_SECONDS` | `10` | Request timeout for HEAD probes | -| `HTTP_USE_HTTP2` | `true` | Enable HTTP/2 for all requests | +| Variable | Default | Description | +| ---------------------- | ------- | ----------------------------------- | +| `HTTP_CONCURRENCY` | `20` | Maximum simultaneous probe requests | +| `HTTP_TIMEOUT_SECONDS` | `10` | Request timeout for HEAD probes | +| `HTTP_USE_HTTP2` | `true` | Enable HTTP/2 for all requests | ### Notifications -| Variable | Default | Description | -|----------|---------|-------------| -| `NOTIFICATION_CHANNEL` | `""` | Slack channel ID for general alerts (frontier hits, D→P transitions); empty = disabled | -| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recently modified draft near the frontier | -| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on any other recently modified draft | -| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D-paper appears in the index as its published P counterpart | +| Variable | Default | Description | +| ------------------------- | ------- | -------------------------------------------------------------------------------------- | +| `NOTIFICATION_CHANNEL` | `""` | Slack channel ID for general alerts (frontier hits, D→P transitions); empty = disabled | +| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recently modified draft near the frontier | +| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on any other recently modified 
draft | +| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D-paper appears in the index as its published P counterpart | > Personal watchlist matches (author or paper number) are always sent as a DM to the matching user — they are not posted to `NOTIFICATION_CHANNEL`. ### Storage -| Variable | Default | Description | -|----------|---------|-------------| -| `DATABASE_URL` | `""` | PostgreSQL DSN — required | -| `DATA_DIR` | `./data` | Directory for log files | -| `CACHE_TTL_HOURS` | `1` | How long the wg21.link index cache is considered fresh | +| Variable | Default | Description | +| ----------------- | -------- | ------------------------------------------------------ | +| `DATABASE_URL` | `""` | PostgreSQL DSN — required | +| `DATA_DIR` | `./data` | Directory for log files | +| `CACHE_TTL_HOURS` | `1` | How long the wg21.link index cache is considered fresh | ## Architecture @@ -294,25 +328,25 @@ paperscout-python/ ### PostgreSQL Schema -| Table | Purpose | -|-------|---------| -| `paper_cache` | TTL-cached wg21.link index JSON blob | -| `discovered_urls` | All URLs seen by the ISO prober with timestamps | -| `probe_miss_counts` | Exponential backoff counters per paper number | -| `poll_state` | Last-poll timestamp (singleton row) | -| `user_watchlist` | Per-user author/paper entries with type discrimination | +| Table | Purpose | +| ------------------- | ------------------------------------------------------ | +| `paper_cache` | TTL-cached wg21.link index JSON blob | +| `discovered_urls` | All URLs seen by the ISO prober with timestamps | +| `probe_miss_counts` | Exponential backoff counters per paper number | +| `poll_state` | Last-poll timestamp (singleton row) | +| `user_watchlist` | Per-user author/paper entries with type discrimination | ### Two-Frequency Probing Strategy Every P-number from 1 to the effective frontier is probed. 
Numbers are divided into a **hot** set (probed every 30 min) and a **cold** pool (probed once per day by distributing 1/48 of the pool each cycle). -| Frequency | What | Condition | Per-cycle URLs | -|-----------|------|-----------|----------------| -| **Hot** (every cycle) | Watchlist papers | union of all users' watched paper numbers | D-prefix, latest+1..+2, pdf+html | -| **Hot** (every cycle) | Frontier numbers | ±window around effective frontier | D+P, R0..R1 for unknowns; D, latest+1..+2 for known | -| **Hot** (every cycle) | Recently active papers | date within `HOT_LOOKBACK_MONTHS` | D-prefix, latest+1..+2, pdf+html | -| **Cold** (1/48 per cycle ≈ daily) | All other P-numbers | everything else | D-prefix, latest+1, pdf+html | -| **Cold** (1/48 per cycle) | Gap numbers (no index entry) | 1..frontier minus known | D+P, R0..R1, pdf+html | +| Frequency | What | Condition | Per-cycle URLs | +| --------------------------------- | ---------------------------- | ----------------------------------------- | --------------------------------------------------- | +| **Hot** (every cycle) | Watchlist papers | union of all users' watched paper numbers | D-prefix, latest+1..+2, pdf+html | +| **Hot** (every cycle) | Frontier numbers | ±window around effective frontier | D+P, R0..R1 for unknowns; D, latest+1..+2 for known | +| **Hot** (every cycle) | Recently active papers | date within `HOT_LOOKBACK_MONTHS` | D-prefix, latest+1..+2, pdf+html | +| **Cold** (1/48 per cycle ≈ daily) | All other P-numbers | everything else | D-prefix, latest+1, pdf+html | +| **Cold** (1/48 per cycle) | Gap numbers (no index entry) | 1..frontier minus known | D+P, R0..R1, pdf+html | Typical per-cycle request count: **~1,600–2,000 HEAD requests** (~8–10 s at 20 concurrent, 100 ms latency). A full sweep of all ~4,000 P-numbers completes within ~24 h of continuous 30-min polling. @@ -328,11 +362,11 @@ The `Last-Modified` timestamp is shown in every notification message. 
## Data Sources -| Source | URL | What it covers | -|--------|-----|---------------| -| wg21.link | `https://wg21.link/index.json` | All published P/N papers with metadata | -| open-std.org | `https://www.open-std.org/jtc1/sc22/wg21/docs/papers/{year}/` | Yearly HTML tables (scraper defined, not yet scheduled) | -| isocpp.org | `https://isocpp.org/files/papers/{D\|P}{num}R{rev}.{pdf\|html}` | D-paper drafts (no index, requires probing) | +| Source | URL | What it covers | +| ------------ | --------------------------------------------------------------- | ------------------------------------------------------- | +| wg21.link | `https://wg21.link/index.json` | All published P/N papers with metadata | +| open-std.org | `https://www.open-std.org/jtc1/sc22/wg21/docs/papers/{year}/` | Yearly HTML tables (scraper defined, not yet scheduled) | +| isocpp.org | `https://isocpp.org/files/papers/{D\|P}{num}R{rev}.{pdf\|html}` | D-paper drafts (no index, requires probing) | ## Dependencies @@ -395,13 +429,17 @@ Coverage details are visible in the **Summary** tab of each workflow run (render ### Continuous Deployment -The `.github/workflows/cd.yml` workflow runs on every push to `main`: +The `.github/workflows/cd.yml` workflow runs on push to `main` or `develop` (and supports `workflow_dispatch` from either branch): -1. **Test** — single Python 3.12 pytest run as a gate -2. **Deploy** — SSHes into the server, runs `git pull`, and rebuilds the container with `docker compose up -d --build` -3. **Health check** — verifies `GET /health` returns 200 +1. **Test** — single Python 3.12 pytest run as a gate (re-uses the same coverage threshold as CI). +2. **Deploy** — single environment-driven job: + - Selects the **GitHub Environment** from the branch (`main` → `production`, `develop` → `staging`). + - SSHes using the environment-scoped secrets (`SERVER_HOST`, `SERVER_USER`, `SERVER_SSH_KEY`, optional `SERVER_PORT`). 
+ - Reads per-environment **variables** (`DEPLOY_PATH`, `DEPLOY_BRANCH`, `HEALTH_PORT`) so the same workflow targets prod or staging without code changes. + - Runs `git fetch` + `git checkout` + `git pull --ff-only` against `DEPLOY_BRANCH` to keep deploys deterministic, then `docker compose up -d --build paperscout`. +3. **Health check** — bounded retry loop (12 × 5s) against `http://localhost:${HEALTH_PORT}/health`; the job fails if the endpoint never returns 200. -The app container connects to the host's shared PostgreSQL via `host.docker.internal`. Restarting the container has no effect on the database. +A `concurrency` group keyed by branch prevents overlapping deploys to the same environment. Production and staging targets stay independent because the secret values and variable values differ per environment. ### Database Backups @@ -411,4 +449,4 @@ The `.github/workflows/db-backup.yml` workflow runs daily at 3 AM UTC (and suppo 2. Uploads the dump to Google Cloud Storage (`gs://paperscout-backups/`) 3. Old backups are auto-pruned by a GCS lifecycle rule (30 days) -Required GitHub Secrets for CD and backups are documented in [`deploy/SERVER_SETUP.md`](deploy/SERVER_SETUP.md#9-github-secrets-checklist). +CD secrets and variables are configured per **GitHub Environment** (`production` and `staging`); see the table in [Deployment](#deployment). Other secrets (e.g. database backups) are documented in [`deploy/SERVER_SETUP.md`](deploy/SERVER_SETUP.md#9-github-secrets-checklist). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..78c84c7 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ +# Security policy + +## Supported versions + +| Version | Supported | +| ------- | ------------------ | +| `main` | Yes — latest fixes | +| Tags | Best-effort — use the latest tag for production | + +Older branches may not receive backports unless agreed with maintainers. 
+ +## Reporting a vulnerability + +**Please do not** file a public GitHub issue for undisclosed security vulnerabilities. + +Instead: + +1. Use **[GitHub private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability)** for this repository (if enabled by org settings), **or** +2. Email or contact the **CppAlliance / repository maintainers** through an internal channel your organization documents for security. + +Maintainers will acknowledge receipt as soon as practical, investigate, and coordinate a fix and disclosure timeline with you. + +## Scope + +This policy covers the **paperscout** application code, Docker image, and GitHub workflows in this repository. Infrastructure (servers, PostgreSQL host hardening, Slack workspace policy) is out of scope here but should follow your organization’s security baseline — see [`deploy/SERVER_SETUP.md`](deploy/SERVER_SETUP.md) for deployment hardening notes. diff --git a/docs/handoff.md b/docs/handoff.md new file mode 100644 index 0000000..13d0bcb --- /dev/null +++ b/docs/handoff.md @@ -0,0 +1,69 @@ +# Maintainer handoff — paperscout + +This document captures **design intent**, **operational gotchas**, and **deferred work** so a second maintainer can operate and extend the service without tribal knowledge. For step-by-step setup, see [onboarding.md](onboarding.md) and the [README](../README.md). + +## Non-obvious design decisions + +### 1. Two-frequency hot vs cold ISO probing + +Every poll cycle could probe thousands of `isocpp.org` URLs. The prober splits P-numbers into: + +- **Hot** — frontier band, watchlist numbers, and papers with recent index dates: probed **every** cycle so new D-drafts near the action surface quickly. +- **Cold** — the long tail: each number is visited on a **rotating slice** (`COLD_CYCLE_DIVISOR` cycles ≈ one full pass per day by default). 
+ +**Why:** Full HEAD sweep every 30 minutes would be noisy for operators and rough on isocpp.org; hot/cold keeps latency low where it matters while retaining eventual full coverage. See [README — Two-Frequency Probing Strategy](../README.md#two-frequency-probing-strategy). + +### 2. HEAD-only probes and Last-Modified gating + +ISO probing uses **HTTP HEAD**, not GET, to detect existence and metadata without downloading PDF/HTML bodies. + +**Why HEAD:** Drafts can be large; bandwidth and server load stay bounded. Alerts use the **`Last-Modified`** header so old files discovered for the first time do not spam Slack; missing header is treated as “recent” (first discovery). Implemented in [`ISOProber`](../src/paperscout/sources.py) and summarized in [README — Alerting by Last-Modified](../README.md#alerting-by-last-modified). + +### 3. D→P transition detection via stored probe state + +When the wg21 index gains a **new P** row, the monitor checks whether a matching **D** URL was previously recorded in `discovered_urls`. If so, it emits a **D→P transition** for notification. + +**Why:** The index alone does not tell you that *we* saw the draft first; probe history is the bridge. Logic lives in [`monitor.py`](../src/paperscout/monitor.py) (`DPTransition` / `poll_once`). + +### 4. Slack queue and HTTP 429 + +Outbound Slack messages go through a **background queue** (see [`scout.py`](../src/paperscout/scout.py)) so bursts from one poll do not violate Slack posting limits. The queue respects **HTTP 429** and `Retry-After`. + +**Why:** Bolt handlers must stay responsive; rate limits are easier to reason about in one place than ad hoc sleeps in notifiers. + +### 5. Watchlist DB work off the event loop + +`poll_once` uses `asyncio.to_thread` for `user_watchlist.matches_for_users` because that path uses **synchronous psycopg2** I/O. + +**Why:** Avoid blocking the asyncio loop during PostgreSQL-heavy match resolution while keeping a single-threaded pool model elsewhere. 
+ +## Operational gotchas + +| Topic | What to know | +| ----- | -------------- | +| **isocpp.org** | Third-party availability and latency directly affect cycle time; long cycles increase sleep spacing via `POLL_OVERRUN_COOLDOWN_SECONDS` (see [onboarding — Scheduling](onboarding.md#scheduling-asyncio-loop)). | +| **HEAD volume** | Typical **~1,600–2,000** HEAD requests per cycle at default settings (README architecture section). Tune `HTTP_CONCURRENCY` / windows if needed. | +| **Slack 429** | Expected under burst; queue backs off using response headers — do not remove the queue “to simplify” without a replacement strategy. | +| **Docker + Postgres** | Containers reach the host DB via `host.docker.internal`; Postgres must **listen** and **pg_hba** must allow the Docker bridge — [SERVER_SETUP.md](../deploy/SERVER_SETUP.md). | +| **Logs vs DB** | Rotating files under `DATA_DIR`; durable probe/index/watchlist state in **PostgreSQL** only. | + +## Open TODOs and deferred items + +- **`ENABLE_BULK_OPENSTD` / open-std.org** — Code paths exist in [`sources.py`](../src/paperscout/sources.py); bulk open-std scheduling is **not** integrated into the main poll loop yet (README notes “not yet scheduled”). +- **Eval / roadmap items** — If your org keeps a separate eval or ticket backlog, link it here; this repo does not ship a frozen “eval” document. + +## Credentials and contacts + +| Secret / asset | Who to ask | +| -------------- | ---------- | +| **Slack app** (signing secret, bot token, app-level tokens if any) | Whoever administers the Slack workspace / app registration for this deployment (replace with a named role or mailing list). | +| **PostgreSQL** (production DSN, backup restore) | Infrastructure / DBA for the host that runs `DATABASE_URL` (see [SERVER_SETUP.md](../deploy/SERVER_SETUP.md), [db-backup workflow](../.github/workflows/db-backup.yml)). 
| +| **GitHub Environments** (SSH deploy, GCS backup credentials) | Documented in README **Deployment** and SERVER_SETUP **§9 GitHub secrets checklist**. | + +Replace generic phrases above with **CppAlliance**-specific contacts (Slack channel, email list, or on-call) as soon as they are approved for a public or internal wiki. + +## Related documents + +- [onboarding.md](onboarding.md) — linear setup for developers +- [CONTRIBUTING.md](../CONTRIBUTING.md) — PRs, hooks, releases +- [SECURITY.md](../SECURITY.md) — vulnerability reporting diff --git a/docs/onboarding.md b/docs/onboarding.md new file mode 100644 index 0000000..b662562 --- /dev/null +++ b/docs/onboarding.md @@ -0,0 +1,227 @@ +# Onboarding — paperscout + +This guide is ordered so a new developer can **run the test suite** and **start the service locally** without reading the whole [README](../README.md) first. For Slack app setup, production deploy, and deep architecture, follow links from each section. + +## Prerequisites + +- **Python** 3.10, 3.11, or 3.12 (`requires-python` in [pyproject.toml](../pyproject.toml)) +- **PostgreSQL** (local or remote) — the app stores all durable state in Postgres +- **Git** +- Optional: **Docker** + Docker Compose for containerized runs (see [Deployment](#deployment)) + +## Repository layout + +| Path | Role | +| ---- | ---- | +| [`src/paperscout/__init__.py`](../src/paperscout/__init__.py) | Package marker / version surface for the distribution. | +| [`src/paperscout/__main__.py`](../src/paperscout/__main__.py) | Entry point for `python -m paperscout`: logging, DB pool, Slack app, health server, async scheduler. | +| [`src/paperscout/config.py`](../src/paperscout/config.py) | Pydantic `Settings` — all configuration from environment / `.env`. | +| [`src/paperscout/models.py`](../src/paperscout/models.py) | `Paper` dataclass and enums for paper IDs, types, and file extensions. 
| +| [`src/paperscout/sources.py`](../src/paperscout/sources.py) | `WG21Index` (index fetch + cache), `ISOProber` (async HEAD probing of isocpp.org), open-std scraper hooks. | +| [`src/paperscout/monitor.py`](../src/paperscout/monitor.py) | `Scheduler`, index diffing, D→P transition detection, per-user watchlist match orchestration. | +| [`src/paperscout/scout.py`](../src/paperscout/scout.py) | Slack Bolt app, message queue, channel/DM notifications, command handlers. | +| [`src/paperscout/storage.py`](../src/paperscout/storage.py) | PostgreSQL-backed paper cache, probe state, and per-user watchlists. | +| [`src/paperscout/db.py`](../src/paperscout/db.py) | Connection pool setup and schema DDL. | +| [`src/paperscout/health.py`](../src/paperscout/health.py) | Small HTTP server exposing `GET /health` for orchestration and CD checks. | + +Supporting directories: [`tests/`](../tests/) (pytest), [`deploy/`](../deploy/) (nginx sample + server provisioning), [`.github/workflows/`](../.github/workflows/) (CI/CD). + +## Local development + +### 1. Clone and virtual environment + +```bash +git clone https://github.com/cppalliance/paperscout-python.git +cd paperscout-python +python -m venv .venv +source .venv/bin/activate # Windows Git Bash: source .venv/Scripts/activate +pip install -e ".[dev]" +``` + +### 2. PostgreSQL + +Create a database and user (example names; adjust as needed): + +```sql +CREATE USER paperscout WITH PASSWORD 'your-secret'; +CREATE DATABASE paperscout OWNER paperscout; +``` + +Full host provisioning (listen addresses, `pg_hba.conf`, Docker bridge) is in [deploy/SERVER_SETUP.md](../deploy/SERVER_SETUP.md) (especially §3 and “Allow Docker containers to connect”). + +### 3. Environment file + +```bash +cp .env.example .env +``` + +**Minimum to run the process** (Slack endpoints will not work until tokens and signing secret are set): + +- `DATABASE_URL` — e.g. 
`postgresql://paperscout:your-secret@localhost:5432/paperscout` +- For Slack: `SLACK_SIGNING_SECRET`, `SLACK_BOT_TOKEN` — see [README § Slack App Setup](../README.md#slack-app-setup) + +### 4. Run tests + +Fast run (no coverage gate): + +```bash +./run test +# or: make test +# or: python -m pytest tests/ -v +``` + +Same checks as CI, including the **90% coverage** floor: + +```bash +./run check +# or: make check +``` + +CI configuration: [.github/workflows/ci.yml](../.github/workflows/ci.yml) (Python 3.10–3.12 on `ubuntu-latest`, `main` only). + +### 5. Code quality hooks (recommended) + +Install [pre-commit](https://pre-commit.com/) hooks after `pip install -e ".[dev]"`: + +```bash +pre-commit install +pre-commit run --all-files +``` + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for PR expectations. + +## Run the service locally + +```bash +python -m paperscout +``` + +- **Slack HTTP app** listens on `PORT` (default **3000**). +- **Health** endpoint listens on `health_port` from settings (default **8080**) — `GET /health`. + +For Slack Event Subscriptions you need a public URL (e.g. ngrok); see [README](../README.md#7-set-the-request-url). + +## Deployment (summary) + +Production-style flow: + +1. Configure `.env` on the server (or use `env_file` in Compose). +2. Build and start: + + ```bash + docker compose up -d --build + ``` + +3. Health check from the host (see [docker-compose.yml](../docker-compose.yml) port mappings): + + - App (Slack): `127.0.0.1:9100` → container `3000` + - Health: `127.0.0.1:9101` → container `8080` → e.g. `curl -sf http://127.0.0.1:9101/health` + +Full CD narrative, GitHub Environments, and branch mapping: [README — Deployment](../README.md#deployment). + +## nginx + +Use [deploy/paperscout.conf](../deploy/paperscout.conf) as a reference for TLS termination and proxying `443` → app `3000`, with `/health` routed to the health port. 
Step-by-step integration with an existing site is in [deploy/SERVER_SETUP.md](../deploy/SERVER_SETUP.md) (§4 nginx + TLS). + +## Environment variables (complete reference) + +Every key from [`.env.example`](../.env.example) is listed below. Names in `.env` use **SCREAMING_SNAKE_CASE**; the runtime [Settings](../src/paperscout/config.py) class maps them to lowercase fields. + +### Slack and server + +| Variable | Required | Default / example | Meaning | +| -------- | -------- | ----------------- | ------- | +| `SLACK_SIGNING_SECRET` | Yes (for Slack) | — | Slack app signing secret; verifies incoming requests. | +| `SLACK_BOT_TOKEN` | Yes (for Slack) | — | Bot User OAuth token (`xoxb-…`). | +| `PORT` | No | `3000` | Port for the Slack Bolt HTTP listener. | + +### Database + +| Variable | Required | Meaning | +| -------- | -------- | ------- | +| `DATABASE_URL` | Yes | PostgreSQL DSN, e.g. `postgresql://user:pass@host:5432/paperscout`. In Docker against host Postgres, `host.docker.internal` is typical (see `.env.example`). | + +### Scheduling and sources + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `POLL_INTERVAL_MINUTES` | `30` | Target wall-clock spacing between poll cycles (see [Scheduling](#scheduling-asyncio-loop) below). | +| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | **Minimum** sleep after any cycle that ran longer than one interval — avoids hammering the network if a cycle overruns. | +| `ENABLE_BULK_WG21` | `true` | Fetch and parse wg21.link index each cycle when enabled. | +| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org bulk fetch (not yet wired into the scheduler). | +| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle when enabled. | + +### Probe prefixes / extensions + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `PROBE_PREFIXES` | `["D","P"]` | JSON list of URL prefixes for gap / unknown numbers. 
| +| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | JSON list of file extensions to probe. | + +### Frontier + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `FRONTIER_WINDOW_ABOVE` | `60` | How many P-numbers above the effective frontier to treat as hot each cycle. | +| `FRONTIER_WINDOW_BELOW` | `30` | How many below the frontier window. | +| `FRONTIER_EXPLICIT_RANGES` | `[]` | JSON list of `{"min": n, "max": m}` extra hot ranges. | +| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before a number is treated as an outlier for frontier calculation. | + +### Hot / cold probing + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `HOT_LOOKBACK_MONTHS` | `6` | Papers with index dates in this window are probed every cycle (hot). | +| `HOT_REVISION_DEPTH` | `2` | Extra revision indices ahead of the known latest for hot numbers. | +| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold pool. | +| `COLD_CYCLE_DIVISOR` | `48` | Cold pool split into this many slices; one slice per cycle (48×30 min ≈ 24 h full sweep). | +| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe revisions `R0` through `R{GAP_MAX_REV}` (this value is the highest revision probed). | + +### Alerting and HTTP client + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `ALERT_MODIFIED_HOURS` | `24` | Only Slack-notify probe hits whose `Last-Modified` is within this many hours (see README). | +| `HTTP_CONCURRENCY` | `20` | Max concurrent async HTTP requests for probing. | +| `HTTP_TIMEOUT_SECONDS` | `10` | Per-request timeout. | +| `HTTP_USE_HTTP2` | `true` | Use HTTP/2 where supported. | + +### Notifications + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `NOTIFICATION_CHANNEL` | empty | Slack channel ID for shared alerts (frontier, D→P, etc.); empty disables channel posts. | +| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recent draft hits near the frontier. 
| +| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on other recent draft hits. | +| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D URL’s paper appears as P in the index. | + +### Storage and logging + +| Variable | Default | Meaning | +| -------- | ------- | ------- | +| `DATA_DIR` | `./data` | Log directory (and local file layout); created if missing. | +| `CACHE_TTL_HOURS` | `1` | Staleness window for cached wg21 index blob in Postgres. | +| `LOG_LEVEL` | `INFO` | Console/file log level (`DEBUG`, `INFO`, `WARNING`, `ERROR`). | +| `LOG_RETENTION_DAYS` | `7` | Days of rotated log files to retain. | + +**Note:** `health_port` (default `8080`) exists in [Settings](../src/paperscout/config.py) but is not listed in `.env.example`; because Settings maps environment variables to its fields, setting `HEALTH_PORT` in the environment already overrides the default — consider adding it to `.env.example` in the future. + +## Scheduling (asyncio loop) + +The [`Scheduler`](../src/paperscout/monitor.py) runs inside the same asyncio event loop as the HTTP clients: + +1. **`seed()`** (first cycle only): loads the wg21 index if enabled, snapshots papers, optionally runs one ISO probe pass and records discoveries — **no Slack notifications** on this pass. +2. **`poll_once()`** on later cycles: refresh index, diff against previous snapshot, run ISO probe if enabled, compute D→P transitions, match watchlists, invoke the notify callback with a `PollResult`. +3. **`run_forever()`** loop: + - `interval = POLL_INTERVAL_MINUTES * 60` seconds (target spacing). + - After each `poll_once()`, measure elapsed time. + - `sleep_for = max(interval - elapsed, POLL_OVERRUN_COOLDOWN_SECONDS)` then `await asyncio.sleep(sleep_for)`.
+ +## Where to go next + +- Maintainer context and ops notes: [handoff.md](handoff.md) +- Contributing and releases: [CONTRIBUTING.md](../CONTRIBUTING.md) +- Product and Slack: [README](../README.md) diff --git a/pyproject.toml b/pyproject.toml index 316f9c5..7687f70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ dev = [ "pytest", "pytest-asyncio", "pytest-cov", + "pre-commit", + "ruff", ] [tool.hatch.build.targets.wheel] @@ -42,3 +44,24 @@ exclude_lines = [ "if __name__ == .__main__.:", "raise NotImplementedError", ] + +[tool.ruff] +target-version = "py310" +line-length = 100 +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", + "W", + "F", + "I", + "B", + "UP", +] +ignore = [ + "E501", +] + +[tool.ruff.format] +quote-style = "double" diff --git a/src/paperscout/__init__.py b/src/paperscout/__init__.py index 7c69677..014fd27 100644 --- a/src/paperscout/__init__.py +++ b/src/paperscout/__init__.py @@ -1,4 +1,4 @@ -from importlib.metadata import version, PackageNotFoundError +from importlib.metadata import PackageNotFoundError, version try: __version__ = version("paperscout") diff --git a/src/paperscout/__main__.py b/src/paperscout/__main__.py index b3c7a3a..d94a5c6 100644 --- a/src/paperscout/__main__.py +++ b/src/paperscout/__main__.py @@ -1,4 +1,5 @@ """Entry point: python -m paperscout""" + from __future__ import annotations import asyncio @@ -6,23 +7,21 @@ import logging.handlers import sys import threading -from pathlib import Path - from datetime import datetime, timezone +from pathlib import Path from .config import settings -from .scout import MessageQueue, create_app, notify_channel, notify_users, register_handlers from .db import init_db, init_pool from .health import start_health_server from .monitor import Scheduler +from .scout import MessageQueue, create_app, notify_channel, notify_users, register_handlers from .sources import ISOProber, WG21Index from .storage import ProbeState, UserWatchlist log = 
logging.getLogger("paperscout") -def _setup_logging(data_dir: Path, console_level: str = "INFO", - retention_days: int = 7) -> None: +def _setup_logging(data_dir: Path, console_level: str = "INFO", retention_days: int = 7) -> None: """Configure root logger with: • Console (stderr) — at *console_level*, for interactive monitoring. @@ -57,8 +56,7 @@ def _setup_logging(data_dir: Path, console_level: str = "INFO", root.addHandler(fh) root.addHandler(ch) - for lib in ("httpx", "httpcore", "slack_bolt", "slack_sdk", - "urllib3", "psycopg2"): + for lib in ("httpx", "httpcore", "slack_bolt", "slack_sdk", "urllib3", "psycopg2"): logging.getLogger(lib).setLevel(logging.WARNING) @@ -74,15 +72,20 @@ async def _async_main() -> None: log.info( "=== Paperscout starting port=%d poll=%dmin data=%s log=%s ===", - settings.port, settings.poll_interval_minutes, - data_dir, data_dir / "paperscout.log", + settings.port, + settings.poll_interval_minutes, + data_dir, + data_dir / "paperscout.log", ) log.info( "Settings: hot_lookback=%dmo hot_depth=%d cold_divisor=%d " "alert_hours=%d gap_max_rev=%d frontier_gap=%d", - settings.hot_lookback_months, settings.hot_revision_depth, - settings.cold_cycle_divisor, settings.alert_modified_hours, - settings.gap_max_rev, settings.frontier_gap_threshold, + settings.hot_lookback_months, + settings.hot_revision_depth, + settings.cold_cycle_divisor, + settings.alert_modified_hours, + settings.gap_max_rev, + settings.frontier_gap_threshold, ) if not settings.database_url: @@ -102,7 +105,8 @@ async def _async_main() -> None: mq = MessageQueue(app) mq.start() - paper_count_fn = lambda: len(index.papers) + def paper_count_fn() -> int: + return len(index.papers) def _on_poll_result(result): notify_channel(app, result, mq) @@ -121,7 +125,9 @@ def _on_poll_result(result): start_health_server(settings.health_port, launch_time, state, paper_count_fn) log.info("Starting Slack Bolt app on port %d", settings.port) bolt_thread = threading.Thread( - target=app.start, 
kwargs={"port": settings.port}, daemon=True, + target=app.start, + kwargs={"port": settings.port}, + daemon=True, ) bolt_thread.start() diff --git a/src/paperscout/config.py b/src/paperscout/config.py index 6ed93e6..f5b3f47 100644 --- a/src/paperscout/config.py +++ b/src/paperscout/config.py @@ -23,7 +23,7 @@ class Settings(BaseSettings): poll_interval_minutes: int = 30 # Minimum seconds to sleep after an overrun cycle (poll took longer than # poll_interval_minutes). Acts as a short cooldown before the next cycle. - poll_overrun_cooldown_seconds: int = 300 # 5 min + poll_overrun_cooldown_seconds: int = 300 # 5 min enable_bulk_wg21: bool = True enable_bulk_openstd: bool = True enable_iso_probe: bool = True diff --git a/src/paperscout/db.py b/src/paperscout/db.py index f6b9285..98068dd 100644 --- a/src/paperscout/db.py +++ b/src/paperscout/db.py @@ -1,9 +1,9 @@ """PostgreSQL connection pool and schema initialisation.""" + from __future__ import annotations import logging -import psycopg2 from psycopg2 import pool as pg_pool log = logging.getLogger(__name__) diff --git a/src/paperscout/health.py b/src/paperscout/health.py index 99376c6..2dc2485 100644 --- a/src/paperscout/health.py +++ b/src/paperscout/health.py @@ -1,12 +1,13 @@ """Lightweight HTTP health-check endpoint.""" + from __future__ import annotations import json import logging import threading +from collections.abc import Callable from datetime import datetime, timezone -from http.server import HTTPServer, BaseHTTPRequestHandler -from typing import Callable +from http.server import BaseHTTPRequestHandler, HTTPServer from . 
import __version__ @@ -32,18 +33,21 @@ def do_GET(self) -> None: get_disc = getattr(self.state, "get_all_discovered", lambda: {}) discovered = get_disc() - body = json.dumps({ - "version": __version__, - "uptime_seconds": int(uptime), - "launched_at": self.launch_time.isoformat(), - "papers_loaded": self.paper_count_fn(), - "last_poll": ( - datetime.fromtimestamp(last_poll, tz=timezone.utc).isoformat() - if last_poll else None - ), - "discovered_via_probe": len(discovered), - "iso_probe_enabled": settings.enable_iso_probe, - }).encode() + body = json.dumps( + { + "version": __version__, + "uptime_seconds": int(uptime), + "launched_at": self.launch_time.isoformat(), + "papers_loaded": self.paper_count_fn(), + "last_poll": ( + datetime.fromtimestamp(last_poll, tz=timezone.utc).isoformat() + if last_poll + else None + ), + "discovered_via_probe": len(discovered), + "iso_probe_enabled": settings.enable_iso_probe, + } + ).encode() self.send_response(200) self.send_header("Content-Type", "application/json") diff --git a/src/paperscout/monitor.py b/src/paperscout/monitor.py index 3387efb..7f49458 100644 --- a/src/paperscout/monitor.py +++ b/src/paperscout/monitor.py @@ -16,6 +16,7 @@ # ── Diff Engine ────────────────────────────────────────────────────────────── + @dataclass(slots=True) class DiffResult: new_papers: list[Paper] @@ -35,8 +36,12 @@ def diff_snapshots( new_papers.append(paper) else: old = previous[key] - if (old.title != paper.title or old.author != paper.author - or old.date != paper.date or old.long_link != paper.long_link): + if ( + old.title != paper.title + or old.author != paper.author + or old.date != paper.date + or old.long_link != paper.long_link + ): updated_papers.append(paper) new_papers.sort(key=lambda p: p.date or "", reverse=True) @@ -45,6 +50,7 @@ def diff_snapshots( # ── Per-User Matches ───────────────────────────────────────────────────────── + @dataclass class PerUserMatches: """Watchlist matches for a single Slack user in one poll 
cycle. @@ -52,12 +58,14 @@ class PerUserMatches: Each entry in *papers* and *probe_hits* is a ``(item, match_reason)`` tuple where ``match_reason`` is ``'author'`` or ``'paper'``. """ + papers: list[tuple[Paper, str]] = field(default_factory=list) probe_hits: list[tuple[ProbeHit, str]] = field(default_factory=list) # ── Poll Result ────────────────────────────────────────────────────────────── + @dataclass(slots=True) class DPTransition: """A D-paper draft that has been formally published as its P counterpart. @@ -67,6 +75,7 @@ class DPTransition: *last_modified -- server Last-Modified of the draft (Unix timestamp), or None *discovered_at* -- our wall-clock time when we first found the draft """ + paper: Paper draft_url: str last_modified: float | None @@ -89,6 +98,7 @@ def __init__( # ── Scheduler ──────────────────────────────────────────────────────────────── + class Scheduler: """Coordinates periodic polling: index refresh + ISO probing + notifications.""" @@ -160,13 +170,17 @@ async def poll_once(self) -> PollResult: for paper in diff.new_papers: log.info( "INDEX-NEW id=%-14s author=%-20s date=%s title=%r", - paper.id, paper.author or "?", paper.date or "?", + paper.id, + paper.author or "?", + paper.date or "?", (paper.title or "")[:80], ) for paper in diff.updated_papers: log.debug( "INDEX-UPD id=%-14s author=%-20s date=%s", - paper.id, paper.author or "?", paper.date or "?", + paper.id, + paper.author or "?", + paper.date or "?", ) probe_hits: list[ProbeHit] = [] @@ -174,13 +188,14 @@ async def poll_once(self) -> PollResult: probe_hits = await self.prober.run_cycle() recent_hits = [h for h in probe_hits if h.is_recent] - old_hits = [h for h in probe_hits if not h.is_recent] + old_hits = [h for h in probe_hits if not h.is_recent] if old_hits: log.info( "PROBE-OLD %d hits with Last-Modified outside %dh window " "(recorded to discovered, no alert)", - len(old_hits), self.cfg.alert_modified_hours, + len(old_hits), + self.cfg.alert_modified_hours, ) # D→P 
transitions @@ -189,28 +204,29 @@ async def poll_once(self) -> PollResult: if paper.number is None or paper.revision is None or paper.prefix != "P": continue for ext in self.cfg.probe_extensions: - d_url = ( - f"https://isocpp.org/files/papers/" - f"D{paper.number:04d}R{paper.revision}{ext}" - ) + d_url = f"https://isocpp.org/files/papers/D{paper.number:04d}R{paper.revision}{ext}" info = self.state.discovered_info(d_url) if info is not None: - dp_transitions.append(DPTransition( - paper=paper, - draft_url=d_url, - last_modified=info.get("last_modified"), - discovered_at=info.get("discovered_at", 0.0), - )) + dp_transitions.append( + DPTransition( + paper=paper, + draft_url=d_url, + last_modified=info.get("last_modified"), + discovered_at=info.get("discovered_at", 0.0), + ) + ) lm_ts = info.get("last_modified") disc_ts = info.get("discovered_at", 0.0) log.info( - "D-TO-P id=%s draft=%s " - "draft-lm=%s draft-discovered=%s", - paper.id, d_url, + "D-TO-P id=%s draft=%s draft-lm=%s draft-discovered=%s", + paper.id, + d_url, datetime.fromtimestamp(lm_ts, tz=timezone.utc).strftime("%Y-%m-%d") - if lm_ts else "unknown", + if lm_ts + else "unknown", datetime.fromtimestamp(disc_ts, tz=timezone.utc).strftime("%Y-%m-%d") - if disc_ts else "unknown", + if disc_ts + else "unknown", ) break @@ -224,7 +240,9 @@ async def poll_once(self) -> PollResult: for uid, m in per_user_matches.items(): log.info( "WATCHLIST-MATCH user=%s papers=%d probe_hits=%d", - uid, len(m.papers), len(m.probe_hits), + uid, + len(m.papers), + len(m.probe_hits), ) result = PollResult( @@ -242,21 +260,26 @@ async def poll_once(self) -> PollResult: "index-new=%d index-upd=%d " "probe-recent=%d probe-old=%d " "dp-transitions=%d users-notified=%d", - self._poll_count, elapsed, - len(diff.new_papers), len(diff.updated_papers), - len(recent_hits), len(old_hits), - len(dp_transitions), len(per_user_matches), + self._poll_count, + elapsed, + len(diff.new_papers), + len(diff.updated_papers), + len(recent_hits), + 
len(old_hits), + len(dp_transitions), + len(per_user_matches), ) return result async def run_forever(self) -> None: - interval = self.cfg.poll_interval_minutes * 60 - cooldown = self.cfg.poll_overrun_cooldown_seconds + interval = self.cfg.poll_interval_minutes * 60 + cooldown = self.cfg.poll_overrun_cooldown_seconds log.info( - "SCHEDULER-START interval=%dmin overrun_cooldown=%ds " - "iso_probe=%s wg21=%s", - self.cfg.poll_interval_minutes, cooldown, - self.cfg.enable_iso_probe, self.cfg.enable_bulk_wg21, + "SCHEDULER-START interval=%dmin overrun_cooldown=%ds iso_probe=%s wg21=%s", + self.cfg.poll_interval_minutes, + cooldown, + self.cfg.enable_iso_probe, + self.cfg.enable_bulk_wg21, ) while True: t0 = time.monotonic() @@ -269,6 +292,8 @@ async def run_forever(self) -> None: sleep_for = max(interval - elapsed, cooldown) log.info( "SCHEDULER-SLEEP sleep=%.0fs (poll=%.0fs interval=%ds)", - sleep_for, elapsed, interval, + sleep_for, + elapsed, + interval, ) await asyncio.sleep(sleep_for) diff --git a/src/paperscout/scout.py b/src/paperscout/scout.py index 18f071a..8da8ce5 100644 --- a/src/paperscout/scout.py +++ b/src/paperscout/scout.py @@ -11,8 +11,8 @@ from .config import settings from .models import Paper -from .monitor import DPTransition, PerUserMatches, PollResult -from .sources import ProbeHit, Tier +from .monitor import PollResult +from .sources import Tier from .storage import ProbeState, UserWatchlist log = logging.getLogger(__name__) @@ -30,6 +30,7 @@ def create_app() -> App: # ── Message Queue ───────────────────────────────────────────────────────────── + class MessageQueue: """Thread-safe, rate-limited Slack ``chat.postMessage`` queue. 
@@ -90,7 +91,8 @@ def _send_with_retry(self, channel: str, text: str, kwargs: dict) -> None: retry_after = int(exc.response.headers.get("Retry-After", "5")) log.warning( "MQ 429 rate-limited channel=%s retry_after=%ds", - channel, retry_after, + channel, + retry_after, ) time.sleep(retry_after) # Re-throttle per-channel timer after sleeping @@ -106,6 +108,7 @@ def _send_with_retry(self, channel: str, text: str, kwargs: dict) -> None: # ── Helpers ─────────────────────────────────────────────────────────────────── + def _paper_link(paper: Paper) -> str: url = paper.url or paper.long_link if not url: @@ -134,6 +137,7 @@ def _fmt_lm(lm: datetime | None) -> str: # ── Channel notification ────────────────────────────────────────────────────── + def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: """Post batch/non-watchlist events to the configured notification channel.""" channel = settings.notification_channel @@ -150,11 +154,13 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: d_link = f"<{tr.draft_url}|draft>" disc_str = ( datetime.fromtimestamp(tr.discovered_at, tz=timezone.utc).strftime("%Y-%m-%d") - if tr.discovered_at else "?" + if tr.discovered_at + else "?" 
) lm_str = _fmt_lm( datetime.fromtimestamp(tr.last_modified, tz=timezone.utc) - if tr.last_modified else None + if tr.last_modified + else None ) lines.append( f"• {p_link} — {tr.paper.title}" @@ -164,7 +170,7 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: # Frontier probe hits frontier_hits = [h for h in result.probe_hits if h.tier == Tier.FRONTIER] - other_hits = [h for h in result.probe_hits if h.tier != Tier.FRONTIER] + other_hits = [h for h in result.probe_hits if h.tier != Tier.FRONTIER] if settings.notify_on_frontier_hit and frontier_hits: lines.append(f"*:mag: {len(frontier_hits)} new frontier draft(s):*") @@ -186,8 +192,11 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: batches = _batch_lines(lines, SLACK_MAX_TEXT) log.info( "NOTIFY channel=%s messages=%d dp=%d frontier=%d other=%d", - channel, len(batches), - len(result.dp_transitions), len(frontier_hits), len(other_hits), + channel, + len(batches), + len(result.dp_transitions), + len(frontier_hits), + len(other_hits), ) for batch in batches: mq.enqueue(channel, batch) @@ -195,6 +204,7 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: # ── Per-user DM notifications ───────────────────────────────────────────────── + def notify_users(app: App, result: PollResult, mq: MessageQueue) -> None: """Send DMs to users whose watchlist matched new papers or probe hits.""" if not result.per_user_matches: @@ -224,7 +234,10 @@ def notify_users(app: App, result: PollResult, mq: MessageQueue) -> None: batches = _batch_lines(lines, SLACK_MAX_TEXT) log.info( "NOTIFY-USER user=%s messages=%d papers=%d hits=%d", - user_id, len(batches), len(matches.papers), len(matches.probe_hits), + user_id, + len(batches), + len(matches.papers), + len(matches.probe_hits), ) for batch in batches: mq.enqueue(user_id, batch) @@ -249,6 +262,7 @@ def _batch_lines(lines: list[str], max_len: int) -> list[str]: # ── Command handlers 
────────────────────────────────────────────────────────── + def register_handlers( app: App, user_watchlist: UserWatchlist, @@ -256,7 +270,6 @@ def register_handlers( paper_count_fn, launch_time: datetime | None = None, ) -> None: - def _dispatch(text: str, user_id: str, channel_type: str, say, reply_opts: dict) -> None: words = [w for w in text.split() if w] if not words: @@ -413,9 +426,13 @@ def _show_watchlist( def _handle_status(state: ProbeState, paper_count_fn, say, reply_opts: dict) -> None: - from datetime import datetime as _dt, timezone as _tz + from datetime import datetime as _dt + from datetime import timezone as _tz + last = state.last_poll - last_str = _dt.fromtimestamp(last, tz=_tz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") if last else "never" + last_str = ( + _dt.fromtimestamp(last, tz=_tz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") if last else "never" + ) say( text=( f"*Paperscout Status*\n" @@ -433,6 +450,7 @@ def _handle_status(state: ProbeState, paper_count_fn, say, reply_opts: dict) -> def _handle_version(say, reply_opts: dict) -> None: from . 
import __version__ + say(text=f"Paperscout v{__version__}", **reply_opts) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 721f342..823137a 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -4,11 +4,12 @@ import logging import re import time -from enum import Enum -from typing import Iterable +from collections.abc import Iterable from dataclasses import dataclass, field from datetime import date, datetime, timedelta, timezone from email.utils import parsedate_to_datetime +from enum import Enum + import httpx from .config import Settings, settings @@ -31,9 +32,9 @@ class WG21Index: def __init__(self, pool): self._cache = PaperCache(pool, ttl_hours=settings.cache_ttl_hours) self.papers: dict[str, Paper] = {} - self._max_rev: dict[int, int] = {} # P-number -> highest revision - self._max_p: int = 0 # absolute highest P-number - self._sorted_p_nums: list[int] = [] # sorted unique P-numbers, for gap analysis + self._max_rev: dict[int, int] = {} # P-number -> highest revision + self._max_p: int = 0 # absolute highest P-number + self._sorted_p_nums: list[int] = [] # sorted unique P-numbers, for gap analysis async def refresh(self) -> dict[str, Paper]: cached = self._cache.read_if_fresh() @@ -255,12 +256,12 @@ class ISOProber: # Keys that _stats is reset to at the start of every run_cycle(). 
_STATS_TEMPLATE: dict[str, int] = { "skipped_discovered": 0, # URL already in probe_state - "skipped_in_index": 0, # paper_id already in wg21.link index - "miss": 0, # server returned non-200 - "hit_recent": 0, # 200 + Last-Modified within alert window - "hit_old": 0, # 200 + Last-Modified outside alert window - "hit_no_lm": 0, # 200 + no Last-Modified header (treated as recent) - "error": 0, # httpx / network exception + "skipped_in_index": 0, # paper_id already in wg21.link index + "miss": 0, # server returned non-200 + "hit_recent": 0, # 200 + Last-Modified within alert window + "hit_old": 0, # 200 + Last-Modified outside alert window + "hit_no_lm": 0, # 200 + no Last-Modified header (treated as recent) + "error": 0, # httpx / network exception } def __init__( @@ -285,15 +286,17 @@ async def run_cycle(self) -> list[ProbeHit]: t0 = time.monotonic() urls = self._build_probe_list() - hot_count = sum( - 1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT) - ) + hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT)) cold_count = sum(1 for u in urls if u[1] == Tier.COLD) slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor log.info( "PROBE-START cycle=%d total=%d hot=%d cold=%d slice=%d/%d", - self._cycle, len(urls), hot_count, cold_count, - slice_idx, self.cfg.cold_cycle_divisor, + self._cycle, + len(urls), + hot_count, + cold_count, + slice_idx, + self.cfg.cold_cycle_divisor, ) sem = asyncio.Semaphore(self.cfg.http_concurrency) @@ -329,9 +332,17 @@ async def run_cycle(self) -> list[ProbeHit]: "PROBE-DONE cycle=%d elapsed=%.1fs total=%d " "hit=%d(recent=%d old=%d no-lm=%d) miss=%d " "skip-disc=%d skip-idx=%d err=%d", - self._cycle, elapsed, len(urls), - hit_total, s["hit_recent"], s["hit_old"], s["hit_no_lm"], - s["miss"], s["skipped_discovered"], s["skipped_in_index"], s["error"], + self._cycle, + elapsed, + len(urls), + hit_total, + s["hit_recent"], + s["hit_old"], + s["hit_no_lm"], + s["miss"], + 
s["skipped_discovered"], + s["skipped_in_index"], + s["error"], ) return hits @@ -343,9 +354,8 @@ def _build_probe_list(self) -> list[_Entry]: extra_p_numbers=self.state.paper_nums_from_discovered_iso_urls(), ) hot_known, hot_unknown = self._hot_numbers(frontier) - return ( - self._build_hot_list(frontier, hot_known, hot_unknown) - + self._build_cold_slice(self._cycle, frontier, hot_known, hot_unknown) + return self._build_hot_list(frontier, hot_known, hot_unknown) + self._build_cold_slice( + self._cycle, frontier, hot_known, hot_unknown ) def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: @@ -364,9 +374,7 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: # Recently active papers if self.cfg.hot_lookback_months > 0: - cutoff = date.today() - timedelta( - days=int(self.cfg.hot_lookback_months * 30.44) - ) + cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44)) for p in self.index.papers.values(): if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown": continue @@ -491,7 +499,7 @@ async def _probe_one( break except httpx.HTTPError as exc: if _attempt < _max_retries - 1: - await asyncio.sleep(0.5 * (2 ** _attempt)) + await asyncio.sleep(0.5 * (2**_attempt)) continue log.debug("ERR %s %s (after %d attempts)", url, exc, _max_retries) self._stats["error"] += 1 @@ -512,9 +520,7 @@ async def _probe_one( try: last_modified = parsedate_to_datetime(lm_str) threshold = timedelta(hours=self.cfg.alert_modified_hours) - is_recent = ( - datetime.now(timezone.utc) - last_modified - ) <= threshold + is_recent = (datetime.now(timezone.utc) - last_modified) <= threshold except Exception: pass else: @@ -522,13 +528,13 @@ async def _probe_one( # file; treat as recent so we don't silently drop it. 
is_recent = True - lm_display = ( - last_modified.strftime("%Y-%m-%d %H:%M UTC") - if last_modified else "no-lm" - ) + lm_display = last_modified.strftime("%Y-%m-%d %H:%M UTC") if last_modified else "no-lm" log.info( "HIT tier=%-10s recent=%-5s lm=%-20s %s", - tier, is_recent, lm_display, url, + tier, + is_recent, + lm_display, + url, ) if is_recent and last_modified is not None: @@ -544,8 +550,12 @@ async def _probe_one( front_text = await _fetch_front_text(client, prefix, num, rev) return ProbeHit( - url=url, prefix=prefix, number=num, - revision=rev, extension=ext, tier=tier, + url=url, + prefix=prefix, + number=num, + revision=rev, + extension=ext, + tier=tier, front_text=front_text, last_modified=last_modified, is_recent=is_recent, @@ -578,7 +588,8 @@ async def scrape_open_std(year: int | None = None) -> list[OpenStdEntry]: url = OPEN_STD_URL.format(year=year) try: async with httpx.AsyncClient( - http2=settings.http_use_http2, timeout=30.0, + http2=settings.http_use_http2, + timeout=30.0, ) as client: resp = await client.get(url) resp.raise_for_status() @@ -603,8 +614,13 @@ def _parse_open_std_html(html: str) -> list[OpenStdEntry]: author = re.sub(r"<[^>]+>", "", cells[2]).strip() doc_date = re.sub(r"<[^>]+>", "", cells[3]).strip() subgroup = re.sub(r"<[^>]+>", "", cells[6]).strip() if len(cells) > 6 else "" - entries.append(OpenStdEntry( - paper_id=paper_id, title=title, author=author, - doc_date=doc_date, subgroup=subgroup, - )) + entries.append( + OpenStdEntry( + paper_id=paper_id, + title=title, + author=author, + doc_date=doc_date, + subgroup=subgroup, + ) + ) return entries diff --git a/src/paperscout/storage.py b/src/paperscout/storage.py index 523fb5a..4c2cc31 100644 --- a/src/paperscout/storage.py +++ b/src/paperscout/storage.py @@ -1,16 +1,20 @@ """PostgreSQL-backed storage: PaperCache, ProbeState, UserWatchlist.""" + from __future__ import annotations import json import logging import re import time +from collections.abc import Generator from 
contextlib import contextmanager -from typing import TYPE_CHECKING, Generator +from typing import TYPE_CHECKING if TYPE_CHECKING: from psycopg2.pool import ThreadedConnectionPool + from .monitor import PerUserMatches + log = logging.getLogger(__name__) # isocpp.org draft URLs (same path shape as ISOProber) @@ -28,8 +32,9 @@ def iso_paper_number_from_discovered_url(url: str) -> int | None: # ── Connection helper ──────────────────────────────────────────────────────── + @contextmanager -def _conn(pool: "ThreadedConnectionPool") -> Generator: +def _conn(pool: ThreadedConnectionPool) -> Generator: conn = pool.getconn() try: yield conn @@ -54,7 +59,7 @@ class PaperCache: changes. """ - def __init__(self, pool: "ThreadedConnectionPool", ttl_hours: float = 1.0): + def __init__(self, pool: ThreadedConnectionPool, ttl_hours: float = 1.0): self._pool = pool self.ttl_seconds = ttl_hours * 3600 @@ -113,13 +118,14 @@ def write(self, data: dict) -> None: # ── Probe State ────────────────────────────────────────────────────────────── + class ProbeState: """PostgreSQL-backed probe state: discovered URLs, miss counters, last-poll. All existing methods are preserved with identical signatures. 
""" - def __init__(self, pool: "ThreadedConnectionPool"): + def __init__(self, pool: ThreadedConnectionPool): self._pool = pool self._ensure_poll_row() @@ -143,10 +149,7 @@ def get_all_discovered(self) -> dict[str, dict]: with conn.cursor() as cur: cur.execute("SELECT url, last_modified, discovered_at FROM discovered_urls") rows = cur.fetchall() - return { - url: {"last_modified": lm, "discovered_at": da} - for url, lm, da in rows - } + return {url: {"last_modified": lm, "discovered_at": da} for url, lm, da in rows} def mark_discovered(self, url: str, last_modified_ts: float | None = None) -> None: with _conn(self._pool) as conn: @@ -271,6 +274,7 @@ def save(self) -> None: # ── User Watchlist ─────────────────────────────────────────────────────────── + class UserWatchlist: """Per-user watchlist stored in the ``user_watchlist`` table. @@ -279,7 +283,7 @@ class UserWatchlist: auto-detected: pure digit strings → paper, anything else → author. """ - def __init__(self, pool: "ThreadedConnectionPool"): + def __init__(self, pool: ThreadedConnectionPool): self._pool = pool @staticmethod @@ -330,9 +334,7 @@ def get_all_watched_paper_nums(self) -> set[int]: """Return the union of all watched paper numbers across all users.""" with _conn(self._pool) as conn: with conn.cursor() as cur: - cur.execute( - "SELECT entry FROM user_watchlist WHERE entry_type = 'paper'" - ) + cur.execute("SELECT entry FROM user_watchlist WHERE entry_type = 'paper'") rows = cur.fetchall() result: set[int] = set() for (entry,) in rows: @@ -346,16 +348,14 @@ def _get_all_entries(self) -> list[tuple[str, str, str]]: """Return all rows as ``[(slack_user_id, entry, entry_type)]``.""" with _conn(self._pool) as conn: with conn.cursor() as cur: - cur.execute( - "SELECT slack_user_id, entry, entry_type FROM user_watchlist" - ) + cur.execute("SELECT slack_user_id, entry, entry_type FROM user_watchlist") return [(row[0], row[1], row[2]) for row in cur.fetchall()] def matches_for_users( self, - new_papers: list, 
# list[Paper] - probe_hits: list, # list[ProbeHit] - ) -> dict[str, "PerUserMatches"]: + new_papers: list, # list[Paper] + probe_hits: list, # list[ProbeHit] + ) -> dict[str, PerUserMatches]: """Compute per-user matched papers and probe hits. Returns a dict keyed by ``slack_user_id``. Only users with at least diff --git a/tests/conftest.py b/tests/conftest.py index 80dda73..522fb2a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,15 +3,14 @@ from __future__ import annotations import json as _json +import tempfile from pathlib import Path import pytest from paperscout.config import Settings -from paperscout.models import Paper -from paperscout.storage import ProbeState, UserWatchlist from paperscout.sources import WG21Index -import tempfile +from paperscout.storage import ProbeState # ── FakePool ───────────────────────────────────────────────────────────────── # An in-memory substitute for psycopg2.pool.ThreadedConnectionPool that @@ -116,9 +115,7 @@ def execute(self, sql: str, params=()): self._s.watchlist[key] = etype self.rowcount = 1 - elif ( - "DELETE FROM USER_WATCHLIST WHERE SLACK_USER_ID" in su and "AND ENTRY" in su - ): + elif "DELETE FROM USER_WATCHLIST WHERE SLACK_USER_ID" in su and "AND ENTRY" in su: uid, entry = params[0], params[1] key = (uid, entry) if key in self._s.watchlist: @@ -131,9 +128,7 @@ def execute(self, sql: str, params=()): self._rows = sorted(rows, key=lambda x: (x[1], x[0])) elif "SELECT ENTRY FROM USER_WATCHLIST WHERE ENTRY_TYPE" in su: - self._rows = [ - (e,) for (_, e), t in self._s.watchlist.items() if t == "paper" - ] + self._rows = [(e,) for (_, e), t in self._s.watchlist.items() if t == "paper"] elif "SELECT SLACK_USER_ID, ENTRY, ENTRY_TYPE FROM USER_WATCHLIST" in su: self._rows = [(u, e, t) for (u, e), t in self._s.watchlist.items()] diff --git a/tests/test_health.py b/tests/test_health.py index 9e2dcaf..25b8487 100644 --- a/tests/test_health.py +++ b/tests/test_health.py @@ -1,4 +1,5 @@ """Tests for 
paperscout.health.""" + from __future__ import annotations import json @@ -12,6 +13,7 @@ def _find_free_port() -> int: import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] diff --git a/tests/test_models.py b/tests/test_models.py index 831b1c0..8f209a8 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,13 +1,14 @@ """Tests for paperscout.models.""" + from __future__ import annotations import pytest from paperscout.models import FileExt, Paper, PaperPrefix, PaperType - # ── Enum sanity ────────────────────────────────────────────────────────────── + def test_paper_prefix_values(): assert PaperPrefix.D == "D" assert PaperPrefix.P == "P" @@ -36,6 +37,7 @@ def test_file_ext_values(): # ── Paper properties: P-prefix ─────────────────────────────────────────────── + def test_paper_p_number_prefix_revision(): p = Paper(id="P2300R10") assert p.number == 2300 @@ -59,6 +61,7 @@ def test_paper_d_prefix(): # ── Paper properties: N-prefix ─────────────────────────────────────────────── + def test_paper_n_number(): p = Paper(id="N4950") assert p.number == 4950 @@ -68,13 +71,17 @@ def test_paper_n_number(): # ── Paper properties: issue tracker prefixes ───────────────────────────────── -@pytest.mark.parametrize("paper_id,expected_prefix,expected_num", [ - ("CWG123", "CWG", 123), - ("EWG456", "EWG", 456), - ("LWG789", "LWG", 789), - ("LEWG42", "LEWG", 42), - ("FS10", "FS", 10), -]) + +@pytest.mark.parametrize( + "paper_id,expected_prefix,expected_num", + [ + ("CWG123", "CWG", 123), + ("EWG456", "EWG", 456), + ("LWG789", "LWG", 789), + ("LEWG42", "LEWG", 42), + ("FS10", "FS", 10), + ], +) def test_paper_issue_prefixes(paper_id, expected_prefix, expected_num): p = Paper(id=paper_id) assert p.prefix == expected_prefix @@ -84,6 +91,7 @@ def test_paper_issue_prefixes(paper_id, expected_prefix, expected_num): # ── Paper properties: unknown IDs ──────────────────────────────────────────── + def 
test_paper_unknown_id(): p = Paper(id="UNKNOWN") assert p.number is None @@ -100,6 +108,7 @@ def test_paper_empty_id(): # ── Paper.from_index_entry ─────────────────────────────────────────────────── + def test_from_index_entry_full(): entry = { "title": "Test Paper", @@ -176,6 +185,7 @@ def test_from_index_entry_standing_document_type(): # ── Paper dataclass defaults ────────────────────────────────────────────────── + def test_paper_default_fields(): p = Paper(id="P1234R0") assert p.title == "" diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 4fbcfbe..cf63ec4 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -1,8 +1,9 @@ """Tests for paperscout.monitor.""" + from __future__ import annotations import asyncio -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -23,8 +24,12 @@ def _recent_hit(**kwargs) -> ProbeHit: defaults = dict( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="frontier", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="frontier", + is_recent=True, ) defaults.update(kwargs) return ProbeHit(**defaults) @@ -33,8 +38,12 @@ def _recent_hit(**kwargs) -> ProbeHit: def _old_hit(**kwargs) -> ProbeHit: defaults = dict( url="https://isocpp.org/files/papers/D8888R0.pdf", - prefix="D", number=8888, revision=0, extension=".pdf", - tier="cold", is_recent=False, + prefix="D", + number=8888, + revision=0, + extension=".pdf", + tier="cold", + is_recent=False, last_modified=datetime.now(timezone.utc) - timedelta(days=30), ) defaults.update(kwargs) @@ -43,6 +52,7 @@ def _old_hit(**kwargs) -> ProbeHit: # ── diff_snapshots ──────────────────────────────────────────────────────────── + class TestDiffSnapshots: def _paper(self, pid, **kwargs) -> Paper: defaults = dict(title="T", author="A", date="2024-01-01") @@ 
-89,8 +99,8 @@ def test_new_papers_sorted_by_date_descending(self): prev = {} curr = { "P2300R10": self._paper("P2300R10", date="2024-01-01"), - "P2301R0": self._paper("P2301R0", date="2024-06-01"), - "P2302R0": self._paper("P2302R0", date="2024-03-01"), + "P2301R0": self._paper("P2301R0", date="2024-06-01"), + "P2302R0": self._paper("P2302R0", date="2024-03-01"), } result = diff_snapshots(prev, curr) dates = [p.date for p in result.new_papers] @@ -103,6 +113,7 @@ def test_empty_to_empty(self): # ── PollResult ──────────────────────────────────────────────────────────────── + class TestPollResult: def test_defaults(self): diff = DiffResult(new_papers=[], updated_papers=[]) @@ -112,6 +123,7 @@ def test_defaults(self): def test_explicit_dp_transitions(self): from paperscout.monitor import DPTransition + diff = DiffResult(new_papers=[], updated_papers=[]) paper = Paper(id="P2300R11") tr = DPTransition(paper=paper, draft_url="http://x", last_modified=None, discovered_at=0.0) @@ -128,6 +140,7 @@ def test_explicit_per_user_matches(self): # ── Scheduler ───────────────────────────────────────────────────────────────── + def _make_scheduler(fake_pool, **cfg_overrides): index = MagicMock(spec=WG21Index) index.refresh = AsyncMock() @@ -139,8 +152,11 @@ def _make_scheduler(fake_pool, **cfg_overrides): state = ProbeState(fake_pool) cfg = make_test_settings(**cfg_overrides) scheduler = Scheduler( - index=index, prober=prober, - user_watchlist=user_watchlist, state=state, cfg=cfg, + index=index, + prober=prober, + user_watchlist=user_watchlist, + state=state, + cfg=cfg, ) return scheduler, index, prober, user_watchlist, state @@ -187,8 +203,9 @@ async def test_poll_once_detects_dp_transition(self, fake_pool): draft_url = "https://isocpp.org/files/papers/D9999R0.pdf" state.mark_discovered(draft_url, last_modified_ts=1_700_000_000.0) - new_paper = Paper(id="P9999R0", title="New Published Paper", - author="Author", date="2025-01-01") + new_paper = Paper( + id="P9999R0", title="New 
Published Paper", author="Author", date="2025-01-01" + ) index.papers = {"P9999R0": new_paper} prober.run_cycle = AsyncMock(return_value=[]) @@ -223,6 +240,7 @@ async def test_poll_once_no_dp_transition_when_no_draft(self, fake_pool): async def test_poll_once_dp_transition_logged(self, fake_pool, caplog): import logging + scheduler, index, prober, _, state = _make_scheduler(fake_pool) await scheduler.poll_once() @@ -247,6 +265,7 @@ async def test_poll_count_increments(self, fake_pool): async def test_poll_once_logs_updated_papers(self, fake_pool, caplog): import logging + scheduler, index, prober, _, _ = _make_scheduler(fake_pool) await scheduler.poll_once() @@ -261,6 +280,7 @@ async def test_poll_once_logs_updated_papers(self, fake_pool, caplog): async def test_poll_old_hits_logged(self, fake_pool, caplog): import logging + scheduler, index, prober, _, _ = _make_scheduler(fake_pool) await scheduler.poll_once() old = _old_hit() @@ -305,8 +325,8 @@ async def test_poll_once_calls_notify_callback(self, fake_pool): notified = [] scheduler, _, _, _, _ = _make_scheduler(fake_pool) scheduler.notify_callback = notified.append - await scheduler.poll_once() # seed - await scheduler.poll_once() # real poll + await scheduler.poll_once() # seed + await scheduler.poll_once() # real poll assert len(notified) == 1 async def test_poll_once_skips_refresh_when_disabled(self, fake_pool): diff --git a/tests/test_scout.py b/tests/test_scout.py index 5f10869..d4b13a6 100644 --- a/tests/test_scout.py +++ b/tests/test_scout.py @@ -1,17 +1,13 @@ """Tests for paperscout.scout.""" + from __future__ import annotations -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, patch -import pytest - from paperscout.models import Paper from paperscout.monitor import DiffResult, DPTransition, PerUserMatches, PollResult -from paperscout.sources import ProbeHit -from paperscout.storage import ProbeState, UserWatchlist 
from paperscout.scout import ( - MessageQueue, _batch_lines, _fmt_lm, _format_uptime, @@ -27,10 +23,12 @@ notify_users, register_handlers, ) - +from paperscout.sources import ProbeHit +from paperscout.storage import ProbeState, UserWatchlist # ── Helpers ─────────────────────────────────────────────────────────────────── + def _empty_diff() -> DiffResult: return DiffResult(new_papers=[], updated_papers=[]) @@ -70,8 +68,12 @@ def _make_settings(channel="C123456", **overrides): def _recent_hit(tier="frontier", number=9999, **kwargs) -> ProbeHit: defaults = dict( url=f"https://isocpp.org/files/papers/D{number:04d}R0.pdf", - prefix="D", number=number, revision=0, extension=".pdf", - tier=tier, is_recent=True, + prefix="D", + number=number, + revision=0, + extension=".pdf", + tier=tier, + is_recent=True, last_modified=datetime.now(timezone.utc) - timedelta(hours=2), ) defaults.update(kwargs) @@ -80,6 +82,7 @@ def _recent_hit(tier="frontier", number=9999, **kwargs) -> ProbeHit: # ── _fmt_lm ─────────────────────────────────────────────────────────────────── + class TestFmtLm: def test_none(self): assert "unknown" in _fmt_lm(None) @@ -99,6 +102,7 @@ def test_days_ago_shows_date(self): # ── _paper_link / _hit_label ────────────────────────────────────────────────── + class TestHelpers: def test_paper_link_uses_url(self): paper = Paper(id="P2300R10", url="https://wg21.link/P2300R10") @@ -115,13 +119,13 @@ def test_paper_link_synthesises_wg21_url(self): assert "|P2300R10>" in link def test_hit_label(self): - label = _hit_label("https://isocpp.org/files/papers/D2300R11.pdf", - "D", 2300, 11, ".pdf") + label = _hit_label("https://isocpp.org/files/papers/D2300R11.pdf", "D", 2300, 11, ".pdf") assert label == "" # ── notify_channel ──────────────────────────────────────────────────────────── + class TestNotifyChannel: def test_no_channel_returns_silently(self): app = MagicMock() @@ -203,11 +207,18 @@ def test_last_modified_shown_in_batch(self): def 
test_dp_all_transitions_are_batched(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="Senders", author="Unknown Author", - url="https://wg21.link/P2300R11") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D2300R11.pdf", - last_modified=1_700_000_000.0, discovered_at=1_699_900_000.0) + paper = Paper( + id="P2300R11", + title="Senders", + author="Unknown Author", + url="https://wg21.link/P2300R11", + ) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D2300R11.pdf", + last_modified=1_700_000_000.0, + discovered_at=1_699_900_000.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings()): notify_channel(app, result, mq) @@ -220,9 +231,12 @@ def test_dp_suppressed_when_disabled(self): app = MagicMock() mq = MagicMock() paper = Paper(id="P2300R11", title="X", author="Y") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D2300R11.pdf", - last_modified=None, discovered_at=0.0) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D2300R11.pdf", + last_modified=None, + discovered_at=0.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings(notify_on_dp_transition=False)): notify_channel(app, result, mq) @@ -232,9 +246,12 @@ def test_dp_batch_contains_draft_link(self): app = MagicMock() mq = MagicMock() paper = Paper(id="P9999R0", title="Foo", author="Bar", url="") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D9999R0.pdf", - last_modified=None, discovered_at=0.0) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D9999R0.pdf", + last_modified=None, + discovered_at=0.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings()): notify_channel(app, result, mq) @@ -245,6 +262,7 @@ def test_dp_batch_contains_draft_link(self): # ── 
notify_users ────────────────────────────────────────────────────────────── + class TestNotifyUsers: def test_no_matches_posts_nothing(self): app = MagicMock() @@ -256,8 +274,9 @@ def test_no_matches_posts_nothing(self): def test_author_match_sends_dm(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="Senders", author="Eric Niebler", - url="https://wg21.link/P2300R11") + paper = Paper( + id="P2300R11", title="Senders", author="Eric Niebler", url="https://wg21.link/P2300R11" + ) pum = PerUserMatches(papers=[(paper, "author")], probe_hits=[]) result = _make_result(per_user_matches={"U123": pum}) notify_users(app, result, mq) @@ -270,8 +289,7 @@ def test_author_match_sends_dm(self): def test_paper_match_sends_dm(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="X", author="Someone", - url="https://wg21.link/P2300R11") + paper = Paper(id="P2300R11", title="X", author="Someone", url="https://wg21.link/P2300R11") pum = PerUserMatches(papers=[(paper, "paper")], probe_hits=[]) result = _make_result(per_user_matches={"U456": pum}) notify_users(app, result, mq) @@ -304,6 +322,7 @@ def test_multiple_users_get_separate_dms(self): # ── _batch_lines ────────────────────────────────────────────────────────────── + class TestBatchLines: def test_single_batch_when_small(self): batches = _batch_lines(["line1", "line2", "line3"], max_len=1000) @@ -324,6 +343,7 @@ def test_single_line_exceeding_limit(self): # ── _reply_opts ─────────────────────────────────────────────────────────────── + class TestReplyOpts: def test_no_thread(self): opts = _reply_opts({"ts": "123"}) @@ -337,6 +357,7 @@ def test_with_thread(self): # ── _handle_watchlist ───────────────────────────────────────────────────────── + class TestHandleWatchlist: def test_add_new_author(self, fake_pool): say = MagicMock() @@ -415,6 +436,7 @@ def test_reply_opts_forwarded(self, fake_pool): # ── _show_watchlist 
─────────────────────────────────────────────────────────── + class TestShowWatchlist: def test_empty_watchlist(self, fake_pool): say = MagicMock() @@ -441,6 +463,7 @@ def test_shows_type_labels(self, fake_pool): # ── _handle_status ──────────────────────────────────────────────────────────── + class TestHandleStatus: def test_status_never_polled(self, fake_pool): state = ProbeState(fake_pool) @@ -462,6 +485,7 @@ def test_status_after_poll(self, fake_pool): # ── register_handlers ───────────────────────────────────────────────────────── + class TestRegisterHandlers: def _setup(self, fake_pool): app = MagicMock() @@ -471,6 +495,7 @@ def capture_event(name): def decorator(fn): registered[name] = fn return fn + return decorator app.event.side_effect = capture_event @@ -556,8 +581,12 @@ def test_app_mention_channel_watchlist_silently_ignored(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["app_mention"]( - event={"text": "<@U1> watchlist list", "ts": "1", - "channel_type": "channel", "user": "U1"}, + event={ + "text": "<@U1> watchlist list", + "ts": "1", + "channel_type": "channel", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -567,8 +596,12 @@ def test_message_mpim_watchlist_gets_error(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["message"]( - event={"text": "<@U1> watchlist add niebler", "channel_type": "mpim", - "ts": "1", "user": "U1"}, + event={ + "text": "<@U1> watchlist add niebler", + "channel_type": "mpim", + "ts": "1", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -580,8 +613,7 @@ def test_message_mpim_status_responds(self, fake_pool): say = MagicMock() with patch("paperscout.scout.settings", _make_settings()): registered["message"]( - event={"text": "<@U1> status", "channel_type": "mpim", - "ts": "1", "user": "U1"}, + event={"text": "<@U1> status", "channel_type": "mpim", "ts": "1", "user": "U1"}, context={"bot_user_id": "U1"}, 
say=say, ) @@ -592,8 +624,12 @@ def test_message_subtype_ignored(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["message"]( - event={"text": "status", "subtype": "message_changed", "channel_type": "im", - "user": "U1"}, + event={ + "text": "status", + "subtype": "message_changed", + "channel_type": "im", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -663,6 +699,7 @@ def test_dispatch_empty_text(self, fake_pool): # ── _handle_version ─────────────────────────────────────────────────────────── + class TestHandleVersion: def test_version_contains_version_string(self): say = MagicMock() @@ -680,6 +717,7 @@ def test_version_forwards_reply_opts(self): # ── _format_uptime / _handle_uptime ────────────────────────────────────────── + class TestUptime: def test_format_uptime_minutes_only(self): delta = timedelta(minutes=5) @@ -724,6 +762,7 @@ def test_handle_uptime_forwards_reply_opts(self): # ── dispatch: version / uptime ──────────────────────────────────────────────── + class TestDispatchVersionUptime: def _setup(self, fake_pool, launch_time=None): app = MagicMock() @@ -733,6 +772,7 @@ def capture_event(name): def decorator(fn): registered[name] = fn return fn + return decorator app.event.side_effect = capture_event @@ -781,9 +821,11 @@ def test_help_mentions_version_and_uptime(self, fake_pool): # ── create_app ──────────────────────────────────────────────────────────────── + class TestCreateApp: def test_create_app_uses_settings(self): from paperscout.scout import create_app + mock_settings = MagicMock() mock_settings.slack_bot_token = "xoxb-test" mock_settings.slack_signing_secret = "secret" diff --git a/tests/test_sources.py b/tests/test_sources.py index a08c684..95adfe3 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -1,4 +1,5 @@ """Tests for paperscout.sources.""" + from __future__ import annotations import asyncio @@ -12,8 +13,6 @@ from paperscout.models import Paper from 
paperscout.sources import ( ISOProber, - OpenStdEntry, - ProbeHit, WG21Index, _fetch_front_text, _fetch_pdf_text, @@ -33,8 +32,10 @@ def _mock_wl(paper_nums=None): # ── Helpers ────────────────────────────────────────────────────────────────── -def _make_response(status: int = 200, json_data=None, text: str = "", - last_modified: datetime | None = None) -> MagicMock: + +def _make_response( + status: int = 200, json_data=None, text: str = "", last_modified: datetime | None = None +) -> MagicMock: resp = MagicMock() resp.status_code = status resp.json = MagicMock(return_value=json_data or {}) @@ -57,7 +58,7 @@ def _make_stream_cm(status: int = 404, chunks: list[bytes] | None = None) -> Asy resp.status_code = status async def _aiter_bytes(chunk_size=65536): - for chunk in (chunks or []): + for chunk in chunks or []: yield chunk resp.aiter_bytes = _aiter_bytes @@ -85,6 +86,7 @@ def _old_lm() -> datetime: # ── WG21Index ──────────────────────────────────────────────────────────────── + class TestWG21Index: async def test_refresh_downloads_when_no_cache(self, fake_pool): index = WG21Index(fake_pool) @@ -228,6 +230,7 @@ def test_parse_ignores_non_dict_entries(self, fake_pool): # ── _fetch_front_text ───────────────────────────────────────────────────────── + class TestFetchFrontText: async def test_returns_plain_text_on_success(self): html = "

Author: Eric Niebler

" @@ -258,6 +261,7 @@ async def test_truncates_to_1000_words(self): # ── _fetch_pdf_text ─────────────────────────────────────────────────────────── + class TestFetchPdfText: async def test_returns_empty_on_non_200(self): client = _make_async_client(stream_cm=_make_stream_cm(404)) @@ -266,6 +270,7 @@ async def test_returns_empty_on_non_200(self): async def test_returns_empty_when_fitz_missing(self): import sys + client = _make_async_client(stream_cm=_make_stream_cm(200, chunks=[b"%PDF-fake"])) with patch.dict(sys.modules, {"fitz": None}): result = await _fetch_pdf_text(client, "https://example.com/test.pdf") @@ -280,6 +285,7 @@ async def test_returns_empty_on_stream_exception(self): async def test_respects_byte_cap(self): """stream() should be cut off after _PDF_MAX_BYTES; no crash.""" from paperscout.sources import _PDF_MAX_BYTES + big_chunk = b"x" * (_PDF_MAX_BYTES + 1) # Even though the chunk exceeds the cap, _fetch_pdf_text must not raise. # Passing invalid PDF bytes → fitz raises → caught → returns "". @@ -337,8 +343,11 @@ async def test_fetch_front_text_falls_back_to_pdf(self): # ── ISOProber: hot/cold list builders ──────────────────────────────────────── + class TestISOProberLists: - def _make_prober(self, fake_pool, watchlist_nums=None, **cfg_overrides) -> tuple[ISOProber, WG21Index, ProbeState]: + def _make_prober( + self, fake_pool, watchlist_nums=None, **cfg_overrides + ) -> tuple[ISOProber, WG21Index, ProbeState]: index = WG21Index(fake_pool) state = ProbeState(fake_pool) cfg = make_test_settings(**cfg_overrides) @@ -437,9 +446,11 @@ def test_hot_recent_paper_by_date(self, fake_pool): recent_date = (date.today() - timedelta(days=30)).isoformat() # _parse_and_index updates _max_rev/_sorted_p_nums but not self.papers; # assign both so that the date-based hot filter can find the paper. 
- index.papers = index._parse_and_index({ - "P5000R2": {"title": "T", "date": recent_date, "type": "paper"}, - }) + index.papers = index._parse_and_index( + { + "P5000R2": {"title": "T", "date": recent_date, "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, _ = prober._hot_numbers(frontier) assert 5000 in hot_known @@ -452,9 +463,11 @@ def test_hot_old_paper_not_included(self, fake_pool): frontier_window_below=0, ) old_date = (date.today() - timedelta(days=365)).isoformat() - index._parse_and_index({ - "P5000R2": {"title": "T", "date": old_date, "type": "paper"}, - }) + index._parse_and_index( + { + "P5000R2": {"title": "T", "date": old_date, "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, _ = prober._hot_numbers(frontier) assert 5000 not in hot_known @@ -555,8 +568,9 @@ def test_hot_numbers_explicit_range(self, fake_pool): assert 200 in hot_unknown or 200 in hot_known def test_hot_paper_skipped_when_no_date(self, fake_pool): - prober, index, _ = self._make_prober(fake_pool, hot_lookback_months=6, - frontier_window_above=0, frontier_window_below=0) + prober, index, _ = self._make_prober( + fake_pool, hot_lookback_months=6, frontier_window_above=0, frontier_window_below=0 + ) # Paper with no date should be silently skipped (the `continue` branch) index.papers = index._parse_and_index({"P6000R0": {"title": "T", "type": "paper"}}) frontier = index.effective_frontier() @@ -564,8 +578,9 @@ def test_hot_paper_skipped_when_no_date(self, fake_pool): assert 6000 not in hot_known def test_hot_paper_skipped_when_bad_date(self, fake_pool): - prober, index, _ = self._make_prober(fake_pool, hot_lookback_months=6, - frontier_window_above=0, frontier_window_below=0) + prober, index, _ = self._make_prober( + fake_pool, hot_lookback_months=6, frontier_window_above=0, frontier_window_below=0 + ) index.papers = index._parse_and_index( {"P6001R0": {"title": "T", "date": "not-a-date", "type": "paper"}} ) @@ -575,8 +590,11 @@ def 
test_hot_paper_skipped_when_bad_date(self, fake_pool): def test_tier_label_recent_for_non_watchlist_non_frontier(self, fake_pool): prober, index, _ = self._make_prober( - fake_pool, watchlist_nums=[1], hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, + fake_pool, + watchlist_nums=[1], + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, ) self._set_frontier(index, 100) # Number 50 is not watchlist and not in frontier range → "recent" @@ -605,9 +623,13 @@ def test_build_hot_list_explicit_ranges_update_frontier_range(self, fake_pool): def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): """Known hot numbers with latest_revision=None should start from R0.""" prober, index, _ = self._make_prober( - fake_pool, watchlist_nums=[9999], hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, - hot_revision_depth=1, gap_max_rev=0, + fake_pool, + watchlist_nums=[9999], + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, + hot_revision_depth=1, + gap_max_rev=0, ) # Add 9999 to _max_rev so it's "known" but with latest_revision=None index._max_rev = {9999: -1, 99: 0, 100: 0} @@ -622,9 +644,12 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): def test_cold_known_skips_when_latest_none(self, fake_pool): """cold_known paper with latest_revision=None should be silently skipped.""" prober, index, _ = self._make_prober( - fake_pool, hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, # empty frontier range - cold_cycle_divisor=1, cold_revision_depth=1, + fake_pool, + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, # empty frontier range + cold_cycle_divisor=1, + cold_revision_depth=1, ) # 4 has _max_rev=-1 → latest_revision=None; 5 is normal # With no frontier window and no watchlist, both are cold_known @@ -635,7 +660,7 @@ def test_cold_known_skips_when_latest_none(self, fake_pool): urls = 
prober._build_cold_slice(1, frontier, hot_known, hot_unknown) cold_nums = {r[3] for r in urls if r[1] == "cold"} assert 4 not in cold_nums # skipped because latest_revision=None - assert 5 in cold_nums # normally probed + assert 5 in cold_nums # normally probed async def test_probe_one_bad_last_modified_header(self, fake_pool): """An unparsable Last-Modified header should not crash; is_recent stays False.""" @@ -662,10 +687,12 @@ def test_cold_excludes_hot_numbers(self, fake_pool): frontier_window_above=0, frontier_window_below=0, ) - index._parse_and_index({ - "P5000R2": {"title": "T", "date": "2020-01-01", "type": "paper"}, - "P5001R0": {"title": "T", "date": "2020-01-01", "type": "paper"}, - }) + index._parse_and_index( + { + "P5000R2": {"title": "T", "date": "2020-01-01", "type": "paper"}, + "P5001R0": {"title": "T", "date": "2020-01-01", "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, hot_unknown = prober._hot_numbers(frontier) urls = prober._build_cold_slice(1, frontier, hot_known, hot_unknown) @@ -675,6 +702,7 @@ def test_cold_excludes_hot_numbers(self, fake_pool): # ── ISOProber: _probe_one ───────────────────────────────────────────────────── + class TestISOProberProbeOne: def _make_prober(self, fake_pool) -> tuple[ISOProber, WG21Index, ProbeState]: index = WG21Index(fake_pool) @@ -824,6 +852,7 @@ async def test_stats_error(self, fake_pool): async def test_run_cycle_logs_unhandled_exception(self, fake_pool, caplog): """If asyncio.gather returns an Exception (not ProbeHit), it is logged.""" import logging + index = WG21Index(fake_pool) index._max_p = 100 index._max_rev = {99: 0, 100: 0} @@ -831,9 +860,12 @@ async def test_run_cycle_logs_unhandled_exception(self, fake_pool, caplog): state = ProbeState(fake_pool) cfg = make_test_settings( watchlist_papers=[9999], - hot_lookback_months=0, hot_revision_depth=1, - frontier_window_above=0, frontier_window_below=0, - gap_max_rev=0, cold_cycle_divisor=100, + hot_lookback_months=0, + 
hot_revision_depth=1, + frontier_window_above=0, + frontier_window_below=0, + gap_max_rev=0, + cold_cycle_divisor=100, ) prober = ISOProber(index, state, user_watchlist=_mock_wl([9999]), cfg=cfg) @@ -882,6 +914,7 @@ async def test_stats_reset_each_cycle(self, fake_pool): # ── ISOProber: run_cycle ────────────────────────────────────────────────────── + class TestISOProberRunCycle: async def test_run_cycle_records_hit_and_marks_discovered(self, fake_pool): index = WG21Index(fake_pool) diff --git a/tests/test_storage.py b/tests/test_storage.py index 8243df8..741a267 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1,11 +1,10 @@ """Tests for paperscout.storage (PostgreSQL-backed via FakePool).""" + from __future__ import annotations import time from unittest.mock import patch -import pytest - from paperscout.models import Paper from paperscout.storage import ( PaperCache, @@ -13,11 +12,10 @@ UserWatchlist, iso_paper_number_from_discovered_url, ) -from tests.conftest import FakePool - # ── PaperCache ──────────────────────────────────────────────────────────────── + class TestPaperCache: def test_is_fresh_when_empty(self, fake_pool): cache = PaperCache(fake_pool, ttl_hours=1.0) @@ -77,6 +75,7 @@ def test_write_non_ascii(self, fake_pool): # ── ProbeState ──────────────────────────────────────────────────────────────── + class TestProbeState: def test_initial_state(self, fake_pool): state = ProbeState(fake_pool) @@ -105,12 +104,14 @@ def test_mark_discovered_stores_last_modified(self, fake_pool): assert entry["discovered_at"] > 0 def test_iso_paper_number_from_discovered_url(self): - assert iso_paper_number_from_discovered_url( - "https://isocpp.org/files/papers/D4165R0.pdf" - ) == 4165 - assert iso_paper_number_from_discovered_url( - "https://isocpp.org/files/papers/P1234R0.html" - ) == 1234 + assert ( + iso_paper_number_from_discovered_url("https://isocpp.org/files/papers/D4165R0.pdf") + == 4165 + ) + assert ( + 
iso_paper_number_from_discovered_url("https://isocpp.org/files/papers/P1234R0.html") + == 1234 + ) assert iso_paper_number_from_discovered_url("https://example.com/") is None def test_paper_nums_from_discovered_iso_urls(self, fake_pool): @@ -221,6 +222,7 @@ def test_miss_counts_property_returns_all(self, fake_pool): # ── UserWatchlist ───────────────────────────────────────────────────────────── + class TestUserWatchlist: def test_add_author_returns_true(self, fake_pool): wl = UserWatchlist(fake_pool) @@ -297,7 +299,6 @@ def test_get_all_watched_paper_nums_union(self, fake_pool): assert nums == {2300, 2301} def test_matches_for_users_author_match(self, fake_pool): - from paperscout.monitor import PerUserMatches wl = UserWatchlist(fake_pool) wl.add("U1", "niebler") paper = Paper(id="P2300R11", title="X", author="Eric Niebler") @@ -307,7 +308,6 @@ def test_matches_for_users_author_match(self, fake_pool): assert paper in matched_papers def test_matches_for_users_paper_match(self, fake_pool): - from paperscout.monitor import PerUserMatches wl = UserWatchlist(fake_pool) wl.add("U1", "2300") paper = Paper(id="P2300R11", title="X", author="Unknown") @@ -328,12 +328,18 @@ def test_matches_for_users_empty_watchlist(self, fake_pool): def test_matches_for_users_probe_hit_author(self, fake_pool): from paperscout.sources import ProbeHit + wl = UserWatchlist(fake_pool) wl.add("U1", "niebler") hit = ProbeHit( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="frontier", front_text="written by niebler", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="frontier", + front_text="written by niebler", + is_recent=True, ) result = wl.matches_for_users([], [hit]) assert "U1" in result @@ -341,12 +347,17 @@ def test_matches_for_users_probe_hit_author(self, fake_pool): def test_matches_for_users_probe_hit_paper_number(self, fake_pool): from paperscout.sources import ProbeHit + wl = 
UserWatchlist(fake_pool) wl.add("U1", "9999") hit = ProbeHit( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="watchlist", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="watchlist", + is_recent=True, ) result = wl.matches_for_users([], [hit]) assert "U1" in result From 54a655001a63eaa6a5b6c9267297ae071e94432e Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 5 May 2026 23:31:23 +0800 Subject: [PATCH 2/4] addressed coderabbitai --- .github/CODEOWNERS | 6 +- .github/workflows/codeql.yml | 4 +- CONTRIBUTING.md | 6 +- docs/handoff.md | 26 +++---- docs/onboarding.md | 129 +++++++++++++++++------------------ src/paperscout/config.py | 2 +- 6 files changed, 86 insertions(+), 87 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 912c957..5f46f20 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,6 +3,6 @@ # Uncomment and set one or more GitHub usernames or @org/team-name entries. # Teams must exist in the org and have visibility to this repository. # -# Example: -# * @alice @bob -# /docs/ @alice +* `@will` `@henry0816191` +/docs/ `@will` `@henry0816191` +/src/ `@will` `@henry0816191` diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 585eb80..409ae4e 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -2,9 +2,9 @@ name: CodeQL on: push: - branches: [main] + branches: [main, develop] pull_request: - branches: [main] + branches: [main, develop] schedule: - cron: "0 12 * * 1" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5a43138..62d91c9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,9 +16,9 @@ Thank you for your interest in improving paperscout. This document describes how ## Workflow 1. **Fork** the repository (if you lack direct push access) and **clone** your fork. -2. Create a **feature branch** from `main` (this repo’s default branch; open PRs target `main`). +2. 
Create a **feature branch** from the active integration branch (currently `develop`; confirm repo default/protection rules before opening). 3. Make focused commits with clear messages. -4. Open a **pull request** against `main`. Use the PR template; link related issues when applicable. +4. Open a **pull request** against the designated target branch (`develop` or `main`, per current release flow). Use the PR template; link related issues when applicable. 5. Ensure **CI is green** (tests + coverage + lint hooks — see below). ### Code owners @@ -42,7 +42,7 @@ pip install -e ".[dev]" # or: make check ``` -CI runs the same coverage gate on Python **3.10**, **3.11**, and **3.12** (see `.github/workflows/ci.yml`). +CI runs `pre-commit run --all-files` for pushes/PRs on configured branches (currently `main` and `develop`; see `.github/workflows/ci.yml`). ### Lint and format (Ruff + pre-commit) diff --git a/docs/handoff.md b/docs/handoff.md index 13d0bcb..8330d1a 100644 --- a/docs/handoff.md +++ b/docs/handoff.md @@ -23,7 +23,7 @@ ISO probing uses **HTTP HEAD**, not GET, to detect existence and metadata withou When the wg21 index gains a **new P** row, the monitor checks whether a matching **D** URL was previously recorded in `discovered_urls`. If so, it emits a **D→P transition** for notification. -**Why:** The index alone does not tell you that *we* saw the draft first; probe history is the bridge. Logic lives in [`monitor.py`](../src/paperscout/monitor.py) (`DPTransition` / `poll_once`). +**Why:** The index alone does not tell you that _we_ saw the draft first; probe history is the bridge. Logic lives in [`monitor.py`](../src/paperscout/monitor.py) (`DPTransition` / `poll_once`). ### 4. 
Slack queue and HTTP 429 @@ -39,13 +39,13 @@ Outbound Slack messages go through a **background queue** (see [`scout.py`](../s ## Operational gotchas -| Topic | What to know | -| ----- | -------------- | -| **isocpp.org** | Third-party availability and latency directly affect cycle time; long cycles increase sleep spacing via `POLL_OVERRUN_COOLDOWN_SECONDS` (see [onboarding — Scheduling](onboarding.md#scheduling-asyncio-loop)). | -| **HEAD volume** | Typical **~1,600–2,000** HEAD requests per cycle at default settings (README architecture section). Tune `HTTP_CONCURRENCY` / windows if needed. | -| **Slack 429** | Expected under burst; queue backs off using response headers — do not remove the queue “to simplify” without a replacement strategy. | -| **Docker + Postgres** | Containers reach the host DB via `host.docker.internal`; Postgres must **listen** and **pg_hba** must allow the Docker bridge — [SERVER_SETUP.md](../deploy/SERVER_SETUP.md). | -| **Logs vs DB** | Rotating files under `DATA_DIR`; durable probe/index/watchlist state in **PostgreSQL** only. | +| Topic | What to know | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **isocpp.org** | Third-party availability and latency directly affect cycle time; long cycles increase sleep spacing via `POLL_OVERRUN_COOLDOWN_SECONDS` (see [onboarding — Scheduling](onboarding.md#scheduling-asyncio-loop)). | +| **HEAD volume** | Typical **~1,600–2,000** HEAD requests per cycle at default settings (README architecture section). Tune `HTTP_CONCURRENCY` / windows if needed. | +| **Slack 429** | Expected under burst; queue backs off using response headers — do not remove the queue “to simplify” without a replacement strategy. 
| +| **Docker + Postgres** | Containers reach the host DB via `host.docker.internal`; Postgres must **listen** and **pg_hba** must allow the Docker bridge — [SERVER_SETUP.md](../deploy/SERVER_SETUP.md). | +| **Logs vs DB** | Rotating files under `DATA_DIR`; durable probe/index/watchlist state in **PostgreSQL** only. | ## Open TODOs and deferred items @@ -54,11 +54,11 @@ Outbound Slack messages go through a **background queue** (see [`scout.py`](../s ## Credentials and contacts -| Secret / asset | Who to ask | -| -------------- | ---------- | -| **Slack app** (signing secret, bot token, app-level tokens if any) | Whoever administers the Slack workspace / app registration for this deployment (replace with a named role or mailing list). | -| **PostgreSQL** (production DSN, backup restore) | Infrastructure / DBA for the host that runs `DATABASE_URL` (see [SERVER_SETUP.md](../deploy/SERVER_SETUP.md), [db-backup workflow](../.github/workflows/db-backup.yml)). | -| **GitHub Environments** (SSH deploy, GCS backup credentials) | Documented in README **Deployment** and SERVER_SETUP **§9 GitHub secrets checklist**. | +| Secret / asset | Who to ask | +| ------------------------------------------------------------------ | ----------------------------------------------------------------------------------------- | +| **Slack app** (signing secret, bot token, app-level tokens if any) | `#paperscout-ops` (primary) / `ops-oncall@...` (backup), owned by . | +| **PostgreSQL** (production DSN, backup restore) | `dba-oncall@...` / `#db-operations`, owned by for the `DATABASE_URL` host. | +| **GitHub Environments** (SSH deploy, GCS backup credentials) | Documented in README **Deployment** and SERVER_SETUP **§9 GitHub secrets checklist**. | Replace generic phrases above with **CppAlliance**-specific contacts (Slack channel, email list, or on-call) as soon as they are approved for a public or internal wiki. 
diff --git a/docs/onboarding.md b/docs/onboarding.md index b662562..6c67d2b 100644 --- a/docs/onboarding.md +++ b/docs/onboarding.md @@ -7,22 +7,22 @@ This guide is ordered so a new developer can **run the test suite** and **start - **Python** 3.10, 3.11, or 3.12 (`requires-python` in [pyproject.toml](../pyproject.toml)) - **PostgreSQL** (local or remote) — the app stores all durable state in Postgres - **Git** -- Optional: **Docker** + Docker Compose for containerized runs (see [Deployment](#deployment)) +- Optional: **Docker** + Docker Compose for containerized runs (see [Deployment](`#deployment-summary`)) ## Repository layout -| Path | Role | -| ---- | ---- | -| [`src/paperscout/__init__.py`](../src/paperscout/__init__.py) | Package marker / version surface for the distribution. | -| [`src/paperscout/__main__.py`](../src/paperscout/__main__.py) | Entry point for `python -m paperscout`: logging, DB pool, Slack app, health server, async scheduler. | -| [`src/paperscout/config.py`](../src/paperscout/config.py) | Pydantic `Settings` — all configuration from environment / `.env`. | -| [`src/paperscout/models.py`](../src/paperscout/models.py) | `Paper` dataclass and enums for paper IDs, types, and file extensions. | -| [`src/paperscout/sources.py`](../src/paperscout/sources.py) | `WG21Index` (index fetch + cache), `ISOProber` (async HEAD probing of isocpp.org), open-std scraper hooks. | -| [`src/paperscout/monitor.py`](../src/paperscout/monitor.py) | `Scheduler`, index diffing, D→P transition detection, per-user watchlist match orchestration. | -| [`src/paperscout/scout.py`](../src/paperscout/scout.py) | Slack Bolt app, message queue, channel/DM notifications, command handlers. | -| [`src/paperscout/storage.py`](../src/paperscout/storage.py) | PostgreSQL-backed paper cache, probe state, and per-user watchlists. | -| [`src/paperscout/db.py`](../src/paperscout/db.py) | Connection pool setup and schema DDL. 
| -| [`src/paperscout/health.py`](../src/paperscout/health.py) | Small HTTP server exposing `GET /health` for orchestration and CD checks. | +| Path | Role | +| ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | +| [`src/paperscout/__init__.py`](../src/paperscout/__init__.py) | Package marker / version surface for the distribution. | +| [`src/paperscout/__main__.py`](../src/paperscout/__main__.py) | Entry point for `python -m paperscout`: logging, DB pool, Slack app, health server, async scheduler. | +| [`src/paperscout/config.py`](../src/paperscout/config.py) | Pydantic `Settings` — all configuration from environment / `.env`. | +| [`src/paperscout/models.py`](../src/paperscout/models.py) | `Paper` dataclass and enums for paper IDs, types, and file extensions. | +| [`src/paperscout/sources.py`](../src/paperscout/sources.py) | `WG21Index` (index fetch + cache), `ISOProber` (async HEAD probing of isocpp.org), open-std scraper hooks. | +| [`src/paperscout/monitor.py`](../src/paperscout/monitor.py) | `Scheduler`, index diffing, D→P transition detection, per-user watchlist match orchestration. | +| [`src/paperscout/scout.py`](../src/paperscout/scout.py) | Slack Bolt app, message queue, channel/DM notifications, command handlers. | +| [`src/paperscout/storage.py`](../src/paperscout/storage.py) | PostgreSQL-backed paper cache, probe state, and per-user watchlists. | +| [`src/paperscout/db.py`](../src/paperscout/db.py) | Connection pool setup and schema DDL. | +| [`src/paperscout/health.py`](../src/paperscout/health.py) | Small HTTP server exposing `GET /health` for orchestration and CD checks. | Supporting directories: [`tests/`](../tests/) (pytest), [`deploy/`](../deploy/) (nginx sample + server provisioning), [`.github/workflows/`](../.github/workflows/) (CI/CD). @@ -113,7 +113,6 @@ Production-style flow: ``` 3. 
Health check from the host (see [docker-compose.yml](../docker-compose.yml) port mappings): - - App (Slack): `127.0.0.1:9100` → container `3000` - Health: `127.0.0.1:9101` → container `8080` → e.g. `curl -sf http://127.0.0.1:9101/health` @@ -129,82 +128,82 @@ Every key from [`.env.example`](../.env.example) is listed below. Names in `.env ### Slack and server -| Variable | Required | Default / example | Meaning | -| -------- | -------- | ----------------- | ------- | -| `SLACK_SIGNING_SECRET` | Yes (for Slack) | — | Slack app signing secret; verifies incoming requests. | -| `SLACK_BOT_TOKEN` | Yes (for Slack) | — | Bot User OAuth token (`xoxb-…`). | -| `PORT` | No | `3000` | Port for the Slack Bolt HTTP listener. | +| Variable | Required | Default / example | Meaning | +| ---------------------- | --------------- | ----------------- | ----------------------------------------------------- | +| `SLACK_SIGNING_SECRET` | Yes (for Slack) | — | Slack app signing secret; verifies incoming requests. | +| `SLACK_BOT_TOKEN` | Yes (for Slack) | — | Bot User OAuth token (`xoxb-…`). | +| `PORT` | No | `3000` | Port for the Slack Bolt HTTP listener. | ### Database -| Variable | Required | Meaning | -| -------- | -------- | ------- | -| `DATABASE_URL` | Yes | PostgreSQL DSN, e.g. `postgresql://user:pass@host:5432/paperscout`. In Docker against host Postgres, `host.docker.internal` is typical (see `.env.example`). | +| Variable | Required | Meaning | +| -------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `DATABASE_URL` | Yes | PostgreSQL DSN, e.g. `postgresql://user:pass@host:5432/paperscout`. In Docker against host Postgres, `host.docker.internal` is typical (see `.env.example`). 
| ### Scheduling and sources -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `POLL_INTERVAL_MINUTES` | `30` | Target wall-clock spacing between poll cycles (see [Scheduling](#scheduling-asyncio-loop) below). | -| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | **Minimum** sleep after any cycle that ran longer than one interval — avoids hammering the network if a cycle overruns. | -| `ENABLE_BULK_WG21` | `true` | Fetch and parse wg21.link index each cycle when enabled. | -| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org bulk fetch (not yet wired into the scheduler). | -| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle when enabled. | +| Variable | Default | Meaning | +| ------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------- | +| `POLL_INTERVAL_MINUTES` | `30` | Target wall-clock spacing between poll cycles (see [Scheduling](#scheduling-asyncio-loop) below). | +| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | **Minimum** sleep after any cycle that ran longer than one interval — avoids hammering the network if a cycle overruns. | +| `ENABLE_BULK_WG21` | `true` | Fetch and parse wg21.link index each cycle when enabled. | +| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org bulk fetch (not yet wired into the scheduler). | +| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle when enabled. | ### Probe prefixes / extensions -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `PROBE_PREFIXES` | `["D","P"]` | JSON list of URL prefixes for gap / unknown numbers. | -| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | JSON list of file extensions to probe. | +| Variable | Default | Meaning | +| ------------------ | ------------------ | ---------------------------------------------------- | +| `PROBE_PREFIXES` | `["D","P"]` | JSON list of URL prefixes for gap / unknown numbers. 
| +| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | JSON list of file extensions to probe. | ### Frontier -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `FRONTIER_WINDOW_ABOVE` | `60` | How many P-numbers above the effective frontier to treat as hot each cycle. | -| `FRONTIER_WINDOW_BELOW` | `30` | How many below the frontier window. | -| `FRONTIER_EXPLICIT_RANGES` | `[]` | JSON list of `{"min": n, "max": m}` extra hot ranges. | -| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before a number is treated as an outlier for frontier calculation. | +| Variable | Default | Meaning | +| -------------------------- | ------- | -------------------------------------------------------------------------------------------------------- | +| `FRONTIER_WINDOW_ABOVE` | `60` | How many P-numbers above the effective frontier to treat as hot each cycle. | +| `FRONTIER_WINDOW_BELOW` | `30` | How many below the frontier window. | +| `FRONTIER_EXPLICIT_RANGES` | `[]` | JSON list of `{"min": n, "max": m}` extra hot ranges. | +| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before a number is treated as an outlier for frontier calculation. | ### Hot / cold probing -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `HOT_LOOKBACK_MONTHS` | `6` | Papers with index dates in this window are probed every cycle (hot). | -| `HOT_REVISION_DEPTH` | `2` | Extra revision indices ahead of the known latest for hot numbers. | -| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold pool. | -| `COLD_CYCLE_DIVISOR` | `48` | Cold pool split into this many slices; one slice per cycle (48×30 min ≈ 24 h full sweep). | -| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe revisions `R0` … `R` this value. 
| +| Variable | Default | Meaning | +| --------------------- | ------- | ----------------------------------------------------------------------------------------- | +| `HOT_LOOKBACK_MONTHS` | `6` | Papers with index dates in this window are probed every cycle (hot). | +| `HOT_REVISION_DEPTH` | `2` | Extra revision indices ahead of the known latest for hot numbers. | +| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold pool. | +| `COLD_CYCLE_DIVISOR` | `48` | Cold pool split into this many slices; one slice per cycle (48×30 min ≈ 24 h full sweep). | +| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe revisions `R0` … `R` this value. | ### Alerting and HTTP client -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `ALERT_MODIFIED_HOURS` | `24` | Only Slack-notify probe hits whose `Last-Modified` is within this many hours (see README). | -| `HTTP_CONCURRENCY` | `20` | Max concurrent async HTTP requests for probing. | -| `HTTP_TIMEOUT_SECONDS` | `10` | Per-request timeout. | -| `HTTP_USE_HTTP2` | `true` | Use HTTP/2 where supported. | +| Variable | Default | Meaning | +| ---------------------- | ------- | ------------------------------------------------------------------------------------------ | +| `ALERT_MODIFIED_HOURS` | `24` | Only Slack-notify probe hits whose `Last-Modified` is within this many hours (see README). | +| `HTTP_CONCURRENCY` | `20` | Max concurrent async HTTP requests for probing. | +| `HTTP_TIMEOUT_SECONDS` | `10` | Per-request timeout. | +| `HTTP_USE_HTTP2` | `true` | Use HTTP/2 where supported. | ### Notifications -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `NOTIFICATION_CHANNEL` | empty | Slack channel ID for shared alerts (frontier, D→P, etc.); empty disables channel posts. | -| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recent draft hits near the frontier. | -| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on other recent draft hits. 
| -| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D URL’s paper appears as P in the index. | +| Variable | Default | Meaning | +| ------------------------- | ------- | --------------------------------------------------------------------------------------- | +| `NOTIFICATION_CHANNEL` | empty | Slack channel ID for shared alerts (frontier, D→P, etc.); empty disables channel posts. | +| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recent draft hits near the frontier. | +| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on other recent draft hits. | +| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D URL’s paper appears as P in the index. | ### Storage and logging -| Variable | Default | Meaning | -| -------- | ------- | ------- | -| `DATA_DIR` | `./data` | Log directory (and local file layout); created if missing. | -| `CACHE_TTL_HOURS` | `1` | Staleness window for cached wg21 index blob in Postgres. | -| `LOG_LEVEL` | `INFO` | Console/file log level (`DEBUG`, `INFO`, `WARNING`, `ERROR`). | -| `LOG_RETENTION_DAYS` | `7` | Days of rotated log files to retain. | +| Variable | Default | Meaning | +| -------------------- | -------- | ------------------------------------------------------------- | +| `DATA_DIR` | `./data` | Log directory (and local file layout); created if missing. | +| `CACHE_TTL_HOURS` | `1` | Staleness window for cached wg21 index blob in Postgres. | +| `LOG_LEVEL` | `INFO` | Console/file log level (`DEBUG`, `INFO`, `WARNING`, `ERROR`). | +| `LOG_RETENTION_DAYS` | `7` | Days of rotated log files to retain. | -**Note:** `health_port` (default `8080`) exists in [Settings](../src/paperscout/config.py) but is not in `.env.example`; set `HEALTH_PORT` only if you add it to settings or extend `.env.example` in the future. +**Note:** `health_port` (default `8080`) exists in [Settings](../src/paperscout/config.py) but is not listed in `.env.example`. You can still set `HEALTH_PORT` in `.env` if you need to override the default. 
## Scheduling (asyncio loop) diff --git a/src/paperscout/config.py b/src/paperscout/config.py index f5b3f47..23c0dd3 100644 --- a/src/paperscout/config.py +++ b/src/paperscout/config.py @@ -23,7 +23,7 @@ class Settings(BaseSettings): poll_interval_minutes: int = 30 # Minimum seconds to sleep after an overrun cycle (poll took longer than # poll_interval_minutes). Acts as a short cooldown before the next cycle. - poll_overrun_cooldown_seconds: int = 300 # 5 min + poll_overrun_cooldown_seconds: int = Field(default=300, ge=1) # 5 min enable_bulk_wg21: bool = True enable_bulk_openstd: bool = True enable_iso_probe: bool = True From 7fba7d2e82bcbd443718c412aa6d61f678bcad30 Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 5 May 2026 23:40:14 +0800 Subject: [PATCH 3/4] fixed lint errors --- .github/CODEOWNERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5f46f20..84e6ed6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -4,5 +4,5 @@ # Teams must exist in the org and have visibility to this repository. # * `@will` `@henry0816191` -/docs/ `@will` `@henry0816191` -/src/ `@will` `@henry0816191` +/docs/ `@will` `@henry0816191` +/src/ `@will` `@henry0816191` From 5dbd6750c7a70b6206465a70ffeaafd05d0090b5 Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 6 May 2026 02:59:34 +0800 Subject: [PATCH 4/4] addressed Will's reviews --- .github/CODEOWNERS | 4 +-- CODE_OF_CONDUCT.md | 77 ---------------------------------------------- CONTRIBUTING.md | 5 --- docs/handoff.md | 10 ------ 4 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 CODE_OF_CONDUCT.md diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 84e6ed6..30355e1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,6 +3,4 @@ # Uncomment and set one or more GitHub usernames or @org/team-name entries. # Teams must exist in the org and have visibility to this repository. 
# -* `@will` `@henry0816191` -/docs/ `@will` `@henry0816191` -/src/ `@will` `@henry0816191` +* @wpak-ai diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index ea5a2d0..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,77 +0,0 @@ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our community include: - -- Demonstrating empathy and kindness toward other people -- Being respectful of differing opinions, viewpoints, and experiences -- Giving and gracefully accepting constructive feedback -- Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience -- Focusing on what is best not just for us as individuals, but for the overall community - -Examples of unacceptable behavior include: - -- The use of sexualized language or imagery, and sexual attention or advances of any kind -- Trolling, insulting or derogatory comments, and personal or political attacks -- Public or private harassment -- Publishing others' private information, such as a physical or email address, without their explicit permission -- Other conduct which could reasonably be considered inappropriate in a professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair 
corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the **repository maintainers** (see [GitHub organization](https://github.com/cppalliance) or use [GitHub Report abuse](https://github.com/contact/report-abuse) for content hosted on GitHub). Replace this paragraph with a dedicated conduct email or ticketing process when your organization publishes one. - -All community leaders are obligated to respect the privacy and security of the reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series of actions. - -**Consequence**: A warning with consequences for continued behavior. 
No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). - -Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). - -For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at [https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations). 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 62d91c9..62366c4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,11 +2,6 @@ Thank you for your interest in improving paperscout. This document describes how we work, how to run checks locally, and how releases are cut. -## Community standards - -- Please read the [Code of Conduct](CODE_OF_CONDUCT.md). Participation is governed by the **Contributor Covenant 2.1**. -- Report **security vulnerabilities** privately — see [SECURITY.md](SECURITY.md). Do not open public issues for undisclosed security problems. - ## Where to start - **[docs/onboarding.md](docs/onboarding.md)** — clone, database, `.env`, tests, and running the app locally. diff --git a/docs/handoff.md b/docs/handoff.md index 8330d1a..100c92e 100644 --- a/docs/handoff.md +++ b/docs/handoff.md @@ -52,16 +52,6 @@ Outbound Slack messages go through a **background queue** (see [`scout.py`](../s - **`ENABLE_BULK_OPENSTD` / open-std.org** — Code paths exist in [`sources.py`](../src/paperscout/sources.py); bulk open-std scheduling is **not** integrated into the main poll loop yet (README notes “not yet scheduled”). - **Eval / roadmap items** — If your org keeps a separate eval or ticket backlog, link it here; this repo does not ship a frozen “eval” document. -## Credentials and contacts - -| Secret / asset | Who to ask | -| ------------------------------------------------------------------ | ----------------------------------------------------------------------------------------- | -| **Slack app** (signing secret, bot token, app-level tokens if any) | `#paperscout-ops` (primary) / `ops-oncall@...` (backup), owned by . | -| **PostgreSQL** (production DSN, backup restore) | `dba-oncall@...` / `#db-operations`, owned by for the `DATABASE_URL` host. | -| **GitHub Environments** (SSH deploy, GCS backup credentials) | Documented in README **Deployment** and SERVER_SETUP **§9 GitHub secrets checklist**. 
| - -Replace generic phrases above with **CppAlliance**-specific contacts (Slack channel, email list, or on-call) as soon as they are approved for a public or internal wiki. - ## Related documents - [onboarding.md](onboarding.md) — linear setup for developers