diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..5fed995 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,15 @@ +# Default: normalize line endings in the repo checkout. +* text=auto + +# Unix-style scripts and CI configs +*.sh text eol=lf +.github/**/*.yml text eol=lf + +# Python sources +*.py text eol=lf + +# Markdown and docs +*.md text eol=lf + +# Windows batch files (if any) +*.bat text eol=crlf diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..30355e1 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners +# +# Uncomment and set one or more GitHub usernames or @org/team-name entries. +# Teams must exist in the org and have visibility to this repository. +# +* @wpak-ai diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..212df45 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,59 @@ +name: Bug report +description: File a bug report to help us improve paperscout +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report. + **Do not** use this form for security vulnerabilities — follow **[SECURITY.md](../SECURITY.md)** (private reporting). + + - type: textarea + id: summary + attributes: + label: Summary + description: What went wrong, in one short paragraph? + validations: + required: true + + - type: textarea + id: repro + attributes: + label: Steps to reproduce + description: How can a maintainer reproduce the issue? + placeholder: | + 1. … + 2. … + 3. 
… + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behavior + validations: + required: true + + - type: textarea + id: actual + attributes: + label: Actual behavior + validations: + required: true + + - type: input + id: version + attributes: + label: Version / commit + description: Git SHA or release tag, if known + validations: + required: false + + - type: textarea + id: env + attributes: + label: Environment + description: OS, Python version, Docker vs bare metal, etc. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..64eb98d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,2 @@ +blank_issues_enabled: true +contact_links: [] diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..32e1aee --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,34 @@ +name: Feature request +description: Suggest an idea for paperscout +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem / motivation + description: What user or operator problem does this solve? + validations: + required: true + + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: What would you like to see happen? + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + validations: + required: false + + - type: textarea + id: context + attributes: + label: Additional context + description: Links, screenshots, Slack behavior, etc. 
+ validations: + required: false diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..8be1ecc --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,13 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 10 + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 10 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..1e7519f --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,12 @@ +## Summary + + + +## Test plan + +- [ ] `./run check` (or `make check`) +- [ ] `pre-commit run --all-files` + +## Related issues + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b3dfe52..85d5d21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,29 @@ on: branches: [main, develop] jobs: + lint: + name: Lint (pre-commit) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Cache pre-commit environments + uses: actions/cache@v4 + with: + path: ~/.cache/pre-commit + key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} + + - name: Run pre-commit + run: pre-commit run --all-files + test: name: Test (Python ${{ matrix.python-version }}) runs-on: ubuntu-latest diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..409ae4e --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,33 @@ +name: CodeQL + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + schedule: + - cron: "0 12 * * 1" + +jobs: + analyze: + name: Analyze (Python) + runs-on: ubuntu-latest + permissions: + security-events: write + actions: read + 
contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: python + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9da40d6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +# See https://pre-commit.com +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + args: ["--maxkb=800"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.6 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/.vscode/settings.json b/.vscode/settings.json index 7e68766..af690fc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ { "python-envs.pythonProjects": [] -} \ No newline at end of file +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3afa3ec --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- Open-source hygiene: contributing guide, security policy, code of conduct, onboarding and handoff docs, pre-commit (Ruff), GitHub issue templates, Dependabot, CodeQL, CODEOWNERS template, and `.gitattributes`. + +## [0.1.0] - 2026-05-05 + +### Added + +- Initial public release as tracked in `pyproject.toml` (WG21 paper tracking, Slack integration, PostgreSQL storage, Docker deploy, CI/CD workflows). 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..62366c4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,72 @@ +# Contributing to paperscout + +Thank you for your interest in improving paperscout. This document describes how we work, how to run checks locally, and how releases are cut. + +## Where to start + +- **[docs/onboarding.md](docs/onboarding.md)** — clone, database, `.env`, tests, and running the app locally. +- **[docs/handoff.md](docs/handoff.md)** — maintainer-oriented design notes and operational gotchas. +- **[README.md](README.md)** — product behavior, Slack setup, deployment, and environment variable tables. + +## Workflow + +1. **Fork** the repository (if you lack direct push access) and **clone** your fork. +2. Create a **feature branch** from the active integration branch (currently `develop`; confirm repo default/protection rules before opening). +3. Make focused commits with clear messages. +4. Open a **pull request** against the designated target branch (`develop` or `main`, per current release flow). Use the PR template; link related issues when applicable. +5. Ensure **CI is green** (tests + coverage + lint hooks — see below). + +### Code owners + +[`.github/CODEOWNERS`](.github/CODEOWNERS) routes review requests. If GitHub reports unknown owners, maintainers should update that file with real `@username` or `@org/team` entries and ensure the team exists and has repository access. + +## Local checks + +Install the package in editable mode with dev dependencies: + +```bash +python -m venv .venv +source .venv/bin/activate # Windows Git Bash: source .venv/Scripts/activate +pip install -e ".[dev]" +``` + +### Tests and coverage + +```bash +./run check # pytest + coverage, fails under 90% line coverage (matches CI) +# or: make check +``` + +CI runs this same coverage-gated test suite for pushes/PRs on configured branches (currently `main` and `develop`; see `.github/workflows/ci.yml`). 
+ +### Lint and format (Ruff + pre-commit) + +We use **[pre-commit](https://pre-commit.com/)** with **[Ruff](https://docs.astral.sh/ruff/)** for linting and formatting. + +```bash +pre-commit install +pre-commit run --all-files +``` + +CI runs `pre-commit run --all-files` on every push and pull request (see the `lint` job in `.github/workflows/ci.yml`). + +## Expectations for changes + +- **Tests** — Add or update tests for behavior changes. Keep coverage at or above the project floor (**90%**). +- **Docs** — Update README, onboarding, or handoff when you change operator-visible behavior, env vars, or deployment steps. +- **Style** — Let Ruff format the tree; avoid unrelated drive-by reformatting of untouched files in the same PR when possible. + +## Releases + +We follow **[Semantic Versioning](https://semver.org/)** and **[Keep a Changelog](https://keepachangelog.com/)** principles. + +1. **Version** — Bump `version` in [`pyproject.toml`](pyproject.toml) (e.g. `0.1.0` → `0.2.0`). +2. **Changelog** — Move items from `## [Unreleased]` to a new section `## [x.y.z] - YYYY-MM-DD` in [`CHANGELOG.md`](CHANGELOG.md). +3. **Tag** — Create an annotated tag: `git tag -a v0.2.0 -m "Release v0.2.0"` and push it: `git push origin v0.2.0`. +4. **GitHub Release** — On GitHub, create a **Release** from that tag and paste the changelog section for that version into the release notes. + +Optional follow-ups (not required today): PyPI publishing workflow, signed tags, or automated release notes. + +## Questions + +Use **GitHub Issues** for bugs and feature ideas (see issue templates). For organizational or access questions, contact the **CppAlliance** maintainers responsible for this repository (replace with a concrete contact when publishing internally). diff --git a/README.md b/README.md index 37ad1dc..bc23c03 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ WG21 C++ paper tracker with ISO draft probing and Slack notifications. 
A Python project that probes the isocpp.org paper system for unpublished D-paper drafts, monitors for new paper assignments at the frontier, and notifies a Slack channel when watched authors publish. +**Docs:** [Developer onboarding](docs/onboarding.md) (clone → DB → tests → run) · [Maintainer handoff](docs/handoff.md) · [Contributing](CONTRIBUTING.md) · [Changelog](CHANGELOG.md) · [Security](SECURITY.md) · [Code of conduct](CODE_OF_CONDUCT.md) + +If you only need to run tests or a local instance, start with **onboarding** before the Slack app sections below. + ## Features - **Per-user watchlists** -- each user manages their own list of authors and paper numbers via DM; the scout sends a personal DM when a match is found @@ -226,6 +230,7 @@ All parameters are configurable via environment variables or a `.env` file. See | Variable | Default | Description | | ----------------------- | ------- | ------------------------------------------------------ | | `POLL_INTERVAL_MINUTES` | `30` | Main polling cycle interval | +| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | Minimum sleep after a poll cycle that overran the interval (avoids tight loops when work or errors stretch a cycle) | | `ENABLE_BULK_WG21` | `true` | Fetch wg21.link/index.json each cycle | | `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org scraping (not yet scheduled) | | `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle | diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..78c84c7 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ +# Security policy + +## Supported versions + +| Version | Supported | +| ------- | ------------------ | +| `main` | Yes — latest fixes | +| Tags | Best-effort — use the latest tag for production | + +Older branches may not receive backports unless agreed with maintainers. + +## Reporting a vulnerability + +**Please do not** file a public GitHub issue for undisclosed security vulnerabilities. + +Instead: + +1. 
Use **[GitHub private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability)** for this repository (if enabled by org settings), **or** +2. Email or contact the **CppAlliance / repository maintainers** through an internal channel your organization documents for security. + +Maintainers will acknowledge receipt as soon as practical, investigate, and coordinate a fix and disclosure timeline with you. + +## Scope + +This policy covers the **paperscout** application code, Docker image, and GitHub workflows in this repository. Infrastructure (servers, PostgreSQL host hardening, Slack workspace policy) is out of scope here but should follow your organization’s security baseline — see [`deploy/SERVER_SETUP.md`](deploy/SERVER_SETUP.md) for deployment hardening notes. diff --git a/docs/handoff.md b/docs/handoff.md new file mode 100644 index 0000000..100c92e --- /dev/null +++ b/docs/handoff.md @@ -0,0 +1,59 @@ +# Maintainer handoff — paperscout + +This document captures **design intent**, **operational gotchas**, and **deferred work** so a second maintainer can operate and extend the service without tribal knowledge. For step-by-step setup, see [onboarding.md](onboarding.md) and the [README](../README.md). + +## Non-obvious design decisions + +### 1. Two-frequency hot vs cold ISO probing + +Every poll cycle could probe thousands of `isocpp.org` URLs. The prober splits P-numbers into: + +- **Hot** — frontier band, watchlist numbers, and papers with recent index dates: probed **every** cycle so new D-drafts near the action surface quickly. +- **Cold** — the long tail: each number is visited on a **rotating slice** (`COLD_CYCLE_DIVISOR` cycles ≈ one full pass per day by default). 
+ +**Why:** Full HEAD sweep every 30 minutes would be noisy for operators and rough on isocpp.org; hot/cold keeps latency low where it matters while retaining eventual full coverage. See [README — Two-Frequency Probing Strategy](../README.md#two-frequency-probing-strategy). + +### 2. HEAD-only probes and Last-Modified gating + +ISO probing uses **HTTP HEAD**, not GET, to detect existence and metadata without downloading PDF/HTML bodies. + +**Why HEAD:** Drafts can be large; bandwidth and server load stay bounded. Alerts use the **`Last-Modified`** header so old files discovered for the first time do not spam Slack; missing header is treated as “recent” (first discovery). Implemented in [`ISOProber`](../src/paperscout/sources.py) and summarized in [README — Alerting by Last-Modified](../README.md#alerting-by-last-modified). + +### 3. D→P transition detection via stored probe state + +When the wg21 index gains a **new P** row, the monitor checks whether a matching **D** URL was previously recorded in `discovered_urls`. If so, it emits a **D→P transition** for notification. + +**Why:** The index alone does not tell you that _we_ saw the draft first; probe history is the bridge. Logic lives in [`monitor.py`](../src/paperscout/monitor.py) (`DPTransition` / `poll_once`). + +### 4. Slack queue and HTTP 429 + +Outbound Slack messages go through a **background queue** (see [`scout.py`](../src/paperscout/scout.py)) so bursts from one poll do not violate Slack posting limits. The queue respects **HTTP 429** and `Retry-After`. + +**Why:** Bolt handlers must stay responsive; rate limits are easier to reason about in one place than ad hoc sleeps in notifiers. + +### 5. Watchlist DB work off the event loop + +`poll_once` uses `asyncio.to_thread` for `user_watchlist.matches_for_users` because that path uses **synchronous psycopg2** I/O. + +**Why:** Avoid blocking the asyncio loop during PostgreSQL-heavy match resolution while keeping a single-threaded pool model elsewhere. 
+ +## Operational gotchas + +| Topic | What to know | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **isocpp.org** | Third-party availability and latency directly affect cycle time; long cycles increase sleep spacing via `POLL_OVERRUN_COOLDOWN_SECONDS` (see [onboarding — Scheduling](onboarding.md#scheduling-asyncio-loop)). | +| **HEAD volume** | Typical **~1,600–2,000** HEAD requests per cycle at default settings (README architecture section). Tune `HTTP_CONCURRENCY` / windows if needed. | +| **Slack 429** | Expected under burst; queue backs off using response headers — do not remove the queue “to simplify” without a replacement strategy. | +| **Docker + Postgres** | Containers reach the host DB via `host.docker.internal`; Postgres must **listen** and **pg_hba** must allow the Docker bridge — [SERVER_SETUP.md](../deploy/SERVER_SETUP.md). | +| **Logs vs DB** | Rotating files under `DATA_DIR`; durable probe/index/watchlist state in **PostgreSQL** only. | + +## Open TODOs and deferred items + +- **`ENABLE_BULK_OPENSTD` / open-std.org** — Code paths exist in [`sources.py`](../src/paperscout/sources.py); bulk open-std scheduling is **not** integrated into the main poll loop yet (README notes “not yet scheduled”). +- **Eval / roadmap items** — If your org keeps a separate eval or ticket backlog, link it here; this repo does not ship a frozen “eval” document. 
+ +## Related documents + +- [onboarding.md](onboarding.md) — linear setup for developers +- [CONTRIBUTING.md](../CONTRIBUTING.md) — PRs, hooks, releases +- [SECURITY.md](../SECURITY.md) — vulnerability reporting diff --git a/docs/onboarding.md b/docs/onboarding.md new file mode 100644 index 0000000..6c67d2b --- /dev/null +++ b/docs/onboarding.md @@ -0,0 +1,226 @@ +# Onboarding — paperscout + +This guide is ordered so a new developer can **run the test suite** and **start the service locally** without reading the whole [README](../README.md) first. For Slack app setup, production deploy, and deep architecture, follow links from each section. + +## Prerequisites + +- **Python** 3.10, 3.11, or 3.12 (`requires-python` in [pyproject.toml](../pyproject.toml)) +- **PostgreSQL** (local or remote) — the app stores all durable state in Postgres +- **Git** +- Optional: **Docker** + Docker Compose for containerized runs (see [Deployment](`#deployment-summary`)) + +## Repository layout + +| Path | Role | +| ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | +| [`src/paperscout/__init__.py`](../src/paperscout/__init__.py) | Package marker / version surface for the distribution. | +| [`src/paperscout/__main__.py`](../src/paperscout/__main__.py) | Entry point for `python -m paperscout`: logging, DB pool, Slack app, health server, async scheduler. | +| [`src/paperscout/config.py`](../src/paperscout/config.py) | Pydantic `Settings` — all configuration from environment / `.env`. | +| [`src/paperscout/models.py`](../src/paperscout/models.py) | `Paper` dataclass and enums for paper IDs, types, and file extensions. | +| [`src/paperscout/sources.py`](../src/paperscout/sources.py) | `WG21Index` (index fetch + cache), `ISOProber` (async HEAD probing of isocpp.org), open-std scraper hooks. 
| +| [`src/paperscout/monitor.py`](../src/paperscout/monitor.py) | `Scheduler`, index diffing, D→P transition detection, per-user watchlist match orchestration. | +| [`src/paperscout/scout.py`](../src/paperscout/scout.py) | Slack Bolt app, message queue, channel/DM notifications, command handlers. | +| [`src/paperscout/storage.py`](../src/paperscout/storage.py) | PostgreSQL-backed paper cache, probe state, and per-user watchlists. | +| [`src/paperscout/db.py`](../src/paperscout/db.py) | Connection pool setup and schema DDL. | +| [`src/paperscout/health.py`](../src/paperscout/health.py) | Small HTTP server exposing `GET /health` for orchestration and CD checks. | + +Supporting directories: [`tests/`](../tests/) (pytest), [`deploy/`](../deploy/) (nginx sample + server provisioning), [`.github/workflows/`](../.github/workflows/) (CI/CD). + +## Local development + +### 1. Clone and virtual environment + +```bash +git clone https://github.com/cppalliance/paperscout-python.git +cd paperscout-python +python -m venv .venv +source .venv/bin/activate # Windows Git Bash: source .venv/Scripts/activate +pip install -e ".[dev]" +``` + +### 2. PostgreSQL + +Create a database and user (example names; adjust as needed): + +```sql +CREATE USER paperscout WITH PASSWORD 'your-secret'; +CREATE DATABASE paperscout OWNER paperscout; +``` + +Full host provisioning (listen addresses, `pg_hba.conf`, Docker bridge) is in [deploy/SERVER_SETUP.md](../deploy/SERVER_SETUP.md) (especially §3 and “Allow Docker containers to connect”). + +### 3. Environment file + +```bash +cp .env.example .env +``` + +**Minimum to run the process** (Slack endpoints will not work until tokens and signing secret are set): + +- `DATABASE_URL` — e.g. `postgresql://paperscout:your-secret@localhost:5432/paperscout` +- For Slack: `SLACK_SIGNING_SECRET`, `SLACK_BOT_TOKEN` — see [README § Slack App Setup](../README.md#slack-app-setup) + +### 4. 
Run tests + +Fast run (no coverage gate): + +```bash +./run test +# or: make test +# or: python -m pytest tests/ -v +``` + +Same checks as CI, including the **90% coverage** floor: + +```bash +./run check +# or: make check +``` + +CI configuration: [.github/workflows/ci.yml](../.github/workflows/ci.yml) (Python 3.10–3.12 on `ubuntu-latest`, for pushes/PRs targeting `main` and `develop`). + +### 5. Code quality hooks (recommended) + +Install [pre-commit](https://pre-commit.com/) hooks after `pip install -e ".[dev]"`: + +```bash +pre-commit install +pre-commit run --all-files +``` + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for PR expectations. + +## Run the service locally + +```bash +python -m paperscout +``` + +- **Slack HTTP app** listens on `PORT` (default **3000**). +- **Health** endpoint listens on `health_port` from settings (default **8080**) — `GET /health`. + +For Slack Event Subscriptions you need a public URL (e.g. ngrok); see [README](../README.md#7-set-the-request-url). + +## Deployment (summary) + +Production-style flow: + +1. Configure `.env` on the server (or use `env_file` in Compose). +2. Build and start: + + ```bash + docker compose up -d --build + ``` + +3. Health check from the host (see [docker-compose.yml](../docker-compose.yml) port mappings): + - App (Slack): `127.0.0.1:9100` → container `3000` + - Health: `127.0.0.1:9101` → container `8080` → e.g. `curl -sf http://127.0.0.1:9101/health` + +Full CD narrative, GitHub Environments, and branch mapping: [README — Deployment](../README.md#deployment). + +## nginx + +Use [deploy/paperscout.conf](../deploy/paperscout.conf) as a reference for TLS termination and proxying `443` → app `3000`, with `/health` routed to the health port. Step-by-step integration with an existing site is in [deploy/SERVER_SETUP.md](../deploy/SERVER_SETUP.md) (§4 nginx + TLS). + +## Environment variables (complete reference) + +Every key from [`.env.example`](../.env.example) is listed below. 
Names in `.env` use **SCREAMING_SNAKE_CASE**; the runtime [Settings](../src/paperscout/config.py) class maps them to lowercase fields. + +### Slack and server + +| Variable | Required | Default / example | Meaning | +| ---------------------- | --------------- | ----------------- | ----------------------------------------------------- | +| `SLACK_SIGNING_SECRET` | Yes (for Slack) | — | Slack app signing secret; verifies incoming requests. | +| `SLACK_BOT_TOKEN` | Yes (for Slack) | — | Bot User OAuth token (`xoxb-…`). | +| `PORT` | No | `3000` | Port for the Slack Bolt HTTP listener. | + +### Database + +| Variable | Required | Meaning | +| -------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `DATABASE_URL` | Yes | PostgreSQL DSN, e.g. `postgresql://user:pass@host:5432/paperscout`. In Docker against host Postgres, `host.docker.internal` is typical (see `.env.example`). | + +### Scheduling and sources + +| Variable | Default | Meaning | +| ------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------- | +| `POLL_INTERVAL_MINUTES` | `30` | Target wall-clock spacing between poll cycles (see [Scheduling](#scheduling-asyncio-loop) below). | +| `POLL_OVERRUN_COOLDOWN_SECONDS` | `300` | **Minimum** sleep after any cycle that ran longer than one interval — avoids hammering the network if a cycle overruns. | +| `ENABLE_BULK_WG21` | `true` | Fetch and parse wg21.link index each cycle when enabled. | +| `ENABLE_BULK_OPENSTD` | `true` | Reserved for open-std.org bulk fetch (not yet wired into the scheduler). | +| `ENABLE_ISO_PROBE` | `true` | Run isocpp.org HEAD probing each cycle when enabled. 
| + +### Probe prefixes / extensions + +| Variable | Default | Meaning | +| ------------------ | ------------------ | ---------------------------------------------------- | +| `PROBE_PREFIXES` | `["D","P"]` | JSON list of URL prefixes for gap / unknown numbers. | +| `PROBE_EXTENSIONS` | `[".pdf",".html"]` | JSON list of file extensions to probe. | + +### Frontier + +| Variable | Default | Meaning | +| -------------------------- | ------- | -------------------------------------------------------------------------------------------------------- | +| `FRONTIER_WINDOW_ABOVE` | `60` | How many P-numbers above the effective frontier to treat as hot each cycle. | +| `FRONTIER_WINDOW_BELOW` | `30` | How many below the frontier window. | +| `FRONTIER_EXPLICIT_RANGES` | `[]` | JSON list of `{"min": n, "max": m}` extra hot ranges. | +| `FRONTIER_GAP_THRESHOLD` | `50` | Max gap between consecutive P-numbers before a number is treated as an outlier for frontier calculation. | + +### Hot / cold probing + +| Variable | Default | Meaning | +| --------------------- | ------- | ----------------------------------------------------------------------------------------- | +| `HOT_LOOKBACK_MONTHS` | `6` | Papers with index dates in this window are probed every cycle (hot). | +| `HOT_REVISION_DEPTH` | `2` | Extra revision indices ahead of the known latest for hot numbers. | +| `COLD_REVISION_DEPTH` | `1` | Revisions ahead of known latest for cold pool. | +| `COLD_CYCLE_DIVISOR` | `48` | Cold pool split into this many slices; one slice per cycle (48×30 min ≈ 24 h full sweep). | +| `GAP_MAX_REV` | `1` | For gap/unknown numbers, probe revisions `R0` … `R` this value. | + +### Alerting and HTTP client + +| Variable | Default | Meaning | +| ---------------------- | ------- | ------------------------------------------------------------------------------------------ | +| `ALERT_MODIFIED_HOURS` | `24` | Only Slack-notify probe hits whose `Last-Modified` is within this many hours (see README). 
| +| `HTTP_CONCURRENCY` | `20` | Max concurrent async HTTP requests for probing. | +| `HTTP_TIMEOUT_SECONDS` | `10` | Per-request timeout. | +| `HTTP_USE_HTTP2` | `true` | Use HTTP/2 where supported. | + +### Notifications + +| Variable | Default | Meaning | +| ------------------------- | ------- | --------------------------------------------------------------------------------------- | +| `NOTIFICATION_CHANNEL` | empty | Slack channel ID for shared alerts (frontier, D→P, etc.); empty disables channel posts. | +| `NOTIFY_ON_FRONTIER_HIT` | `true` | Notify on recent draft hits near the frontier. | +| `NOTIFY_ON_ANY_DRAFT` | `true` | Notify on other recent draft hits. | +| `NOTIFY_ON_DP_TRANSITION` | `true` | Notify when a tracked D URL’s paper appears as P in the index. | + +### Storage and logging + +| Variable | Default | Meaning | +| -------------------- | -------- | ------------------------------------------------------------- | +| `DATA_DIR` | `./data` | Log directory (and local file layout); created if missing. | +| `CACHE_TTL_HOURS` | `1` | Staleness window for cached wg21 index blob in Postgres. | +| `LOG_LEVEL` | `INFO` | Console/file log level (`DEBUG`, `INFO`, `WARNING`, `ERROR`). | +| `LOG_RETENTION_DAYS` | `7` | Days of rotated log files to retain. | + +**Note:** `health_port` (default `8080`) exists in [Settings](../src/paperscout/config.py) but is not listed in `.env.example`. You can still set `HEALTH_PORT` in `.env` if you need to override the default. + +## Scheduling (asyncio loop) + +The [`Scheduler`](../src/paperscout/monitor.py) runs inside the same asyncio event loop as the HTTP clients: + +1. **`seed()`** (first cycle only): loads the wg21 index if enabled, snapshots papers, optionally runs one ISO probe pass and records discoveries — **no Slack notifications** on this pass. +2. 
**`poll_once()`** on later cycles: refresh index, diff against previous snapshot, run ISO probe if enabled, compute D→P transitions, match watchlists, invoke the notify callback with a `PollResult`. +3. **`run_forever()`** loop: + - `interval = POLL_INTERVAL_MINUTES * 60` seconds (target spacing). + - After each `poll_once()`, measure elapsed time. + - `sleep_for = max(interval - elapsed, POLL_OVERRUN_COOLDOWN_SECONDS)` then `await asyncio.sleep(sleep_for)`. + - So: short cycles wait out the remainder of the interval; **long or failed cycles** still sleep at least `POLL_OVERRUN_COOLDOWN_SECONDS` before retrying. + +**Hot vs cold probing** (what runs inside each cycle) is documented in the README: [Two-Frequency Probing Strategy](../README.md#two-frequency-probing-strategy). + +## Where to go next + +- Maintainer context and ops notes: [handoff.md](handoff.md) +- Contributing and releases: [CONTRIBUTING.md](../CONTRIBUTING.md) +- Product and Slack: [README](../README.md) diff --git a/pyproject.toml b/pyproject.toml index 316f9c5..7687f70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ dev = [ "pytest", "pytest-asyncio", "pytest-cov", + "pre-commit", + "ruff", ] [tool.hatch.build.targets.wheel] @@ -42,3 +44,24 @@ exclude_lines = [ "if __name__ == .__main__.:", "raise NotImplementedError", ] + +[tool.ruff] +target-version = "py310" +line-length = 100 +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", + "W", + "F", + "I", + "B", + "UP", +] +ignore = [ + "E501", +] + +[tool.ruff.format] +quote-style = "double" diff --git a/src/paperscout/__init__.py b/src/paperscout/__init__.py index 7c69677..014fd27 100644 --- a/src/paperscout/__init__.py +++ b/src/paperscout/__init__.py @@ -1,4 +1,4 @@ -from importlib.metadata import version, PackageNotFoundError +from importlib.metadata import PackageNotFoundError, version try: __version__ = version("paperscout") diff --git a/src/paperscout/__main__.py b/src/paperscout/__main__.py index b3c7a3a..d94a5c6 
100644 --- a/src/paperscout/__main__.py +++ b/src/paperscout/__main__.py @@ -1,4 +1,5 @@ """Entry point: python -m paperscout""" + from __future__ import annotations import asyncio @@ -6,23 +7,21 @@ import logging.handlers import sys import threading -from pathlib import Path - from datetime import datetime, timezone +from pathlib import Path from .config import settings -from .scout import MessageQueue, create_app, notify_channel, notify_users, register_handlers from .db import init_db, init_pool from .health import start_health_server from .monitor import Scheduler +from .scout import MessageQueue, create_app, notify_channel, notify_users, register_handlers from .sources import ISOProber, WG21Index from .storage import ProbeState, UserWatchlist log = logging.getLogger("paperscout") -def _setup_logging(data_dir: Path, console_level: str = "INFO", - retention_days: int = 7) -> None: +def _setup_logging(data_dir: Path, console_level: str = "INFO", retention_days: int = 7) -> None: """Configure root logger with: • Console (stderr) — at *console_level*, for interactive monitoring. 
@@ -57,8 +56,7 @@ def _setup_logging(data_dir: Path, console_level: str = "INFO", root.addHandler(fh) root.addHandler(ch) - for lib in ("httpx", "httpcore", "slack_bolt", "slack_sdk", - "urllib3", "psycopg2"): + for lib in ("httpx", "httpcore", "slack_bolt", "slack_sdk", "urllib3", "psycopg2"): logging.getLogger(lib).setLevel(logging.WARNING) @@ -74,15 +72,20 @@ async def _async_main() -> None: log.info( "=== Paperscout starting port=%d poll=%dmin data=%s log=%s ===", - settings.port, settings.poll_interval_minutes, - data_dir, data_dir / "paperscout.log", + settings.port, + settings.poll_interval_minutes, + data_dir, + data_dir / "paperscout.log", ) log.info( "Settings: hot_lookback=%dmo hot_depth=%d cold_divisor=%d " "alert_hours=%d gap_max_rev=%d frontier_gap=%d", - settings.hot_lookback_months, settings.hot_revision_depth, - settings.cold_cycle_divisor, settings.alert_modified_hours, - settings.gap_max_rev, settings.frontier_gap_threshold, + settings.hot_lookback_months, + settings.hot_revision_depth, + settings.cold_cycle_divisor, + settings.alert_modified_hours, + settings.gap_max_rev, + settings.frontier_gap_threshold, ) if not settings.database_url: @@ -102,7 +105,8 @@ async def _async_main() -> None: mq = MessageQueue(app) mq.start() - paper_count_fn = lambda: len(index.papers) + def paper_count_fn() -> int: + return len(index.papers) def _on_poll_result(result): notify_channel(app, result, mq) @@ -121,7 +125,9 @@ def _on_poll_result(result): start_health_server(settings.health_port, launch_time, state, paper_count_fn) log.info("Starting Slack Bolt app on port %d", settings.port) bolt_thread = threading.Thread( - target=app.start, kwargs={"port": settings.port}, daemon=True, + target=app.start, + kwargs={"port": settings.port}, + daemon=True, ) bolt_thread.start() diff --git a/src/paperscout/config.py b/src/paperscout/config.py index 6ed93e6..23c0dd3 100644 --- a/src/paperscout/config.py +++ b/src/paperscout/config.py @@ -23,7 +23,7 @@ class 
Settings(BaseSettings): poll_interval_minutes: int = 30 # Minimum seconds to sleep after an overrun cycle (poll took longer than # poll_interval_minutes). Acts as a short cooldown before the next cycle. - poll_overrun_cooldown_seconds: int = 300 # 5 min + poll_overrun_cooldown_seconds: int = Field(default=300, ge=1) # 5 min enable_bulk_wg21: bool = True enable_bulk_openstd: bool = True enable_iso_probe: bool = True diff --git a/src/paperscout/db.py b/src/paperscout/db.py index f6b9285..98068dd 100644 --- a/src/paperscout/db.py +++ b/src/paperscout/db.py @@ -1,9 +1,9 @@ """PostgreSQL connection pool and schema initialisation.""" + from __future__ import annotations import logging -import psycopg2 from psycopg2 import pool as pg_pool log = logging.getLogger(__name__) diff --git a/src/paperscout/health.py b/src/paperscout/health.py index 99376c6..2dc2485 100644 --- a/src/paperscout/health.py +++ b/src/paperscout/health.py @@ -1,12 +1,13 @@ """Lightweight HTTP health-check endpoint.""" + from __future__ import annotations import json import logging import threading +from collections.abc import Callable from datetime import datetime, timezone -from http.server import HTTPServer, BaseHTTPRequestHandler -from typing import Callable +from http.server import BaseHTTPRequestHandler, HTTPServer from . 
import __version__ @@ -32,18 +33,21 @@ def do_GET(self) -> None: get_disc = getattr(self.state, "get_all_discovered", lambda: {}) discovered = get_disc() - body = json.dumps({ - "version": __version__, - "uptime_seconds": int(uptime), - "launched_at": self.launch_time.isoformat(), - "papers_loaded": self.paper_count_fn(), - "last_poll": ( - datetime.fromtimestamp(last_poll, tz=timezone.utc).isoformat() - if last_poll else None - ), - "discovered_via_probe": len(discovered), - "iso_probe_enabled": settings.enable_iso_probe, - }).encode() + body = json.dumps( + { + "version": __version__, + "uptime_seconds": int(uptime), + "launched_at": self.launch_time.isoformat(), + "papers_loaded": self.paper_count_fn(), + "last_poll": ( + datetime.fromtimestamp(last_poll, tz=timezone.utc).isoformat() + if last_poll + else None + ), + "discovered_via_probe": len(discovered), + "iso_probe_enabled": settings.enable_iso_probe, + } + ).encode() self.send_response(200) self.send_header("Content-Type", "application/json") diff --git a/src/paperscout/monitor.py b/src/paperscout/monitor.py index 3387efb..7f49458 100644 --- a/src/paperscout/monitor.py +++ b/src/paperscout/monitor.py @@ -16,6 +16,7 @@ # ── Diff Engine ────────────────────────────────────────────────────────────── + @dataclass(slots=True) class DiffResult: new_papers: list[Paper] @@ -35,8 +36,12 @@ def diff_snapshots( new_papers.append(paper) else: old = previous[key] - if (old.title != paper.title or old.author != paper.author - or old.date != paper.date or old.long_link != paper.long_link): + if ( + old.title != paper.title + or old.author != paper.author + or old.date != paper.date + or old.long_link != paper.long_link + ): updated_papers.append(paper) new_papers.sort(key=lambda p: p.date or "", reverse=True) @@ -45,6 +50,7 @@ def diff_snapshots( # ── Per-User Matches ───────────────────────────────────────────────────────── + @dataclass class PerUserMatches: """Watchlist matches for a single Slack user in one poll 
cycle. @@ -52,12 +58,14 @@ class PerUserMatches: Each entry in *papers* and *probe_hits* is a ``(item, match_reason)`` tuple where ``match_reason`` is ``'author'`` or ``'paper'``. """ + papers: list[tuple[Paper, str]] = field(default_factory=list) probe_hits: list[tuple[ProbeHit, str]] = field(default_factory=list) # ── Poll Result ────────────────────────────────────────────────────────────── + @dataclass(slots=True) class DPTransition: """A D-paper draft that has been formally published as its P counterpart. @@ -67,6 +75,7 @@ class DPTransition: *last_modified -- server Last-Modified of the draft (Unix timestamp), or None *discovered_at* -- our wall-clock time when we first found the draft """ + paper: Paper draft_url: str last_modified: float | None @@ -89,6 +98,7 @@ def __init__( # ── Scheduler ──────────────────────────────────────────────────────────────── + class Scheduler: """Coordinates periodic polling: index refresh + ISO probing + notifications.""" @@ -160,13 +170,17 @@ async def poll_once(self) -> PollResult: for paper in diff.new_papers: log.info( "INDEX-NEW id=%-14s author=%-20s date=%s title=%r", - paper.id, paper.author or "?", paper.date or "?", + paper.id, + paper.author or "?", + paper.date or "?", (paper.title or "")[:80], ) for paper in diff.updated_papers: log.debug( "INDEX-UPD id=%-14s author=%-20s date=%s", - paper.id, paper.author or "?", paper.date or "?", + paper.id, + paper.author or "?", + paper.date or "?", ) probe_hits: list[ProbeHit] = [] @@ -174,13 +188,14 @@ async def poll_once(self) -> PollResult: probe_hits = await self.prober.run_cycle() recent_hits = [h for h in probe_hits if h.is_recent] - old_hits = [h for h in probe_hits if not h.is_recent] + old_hits = [h for h in probe_hits if not h.is_recent] if old_hits: log.info( "PROBE-OLD %d hits with Last-Modified outside %dh window " "(recorded to discovered, no alert)", - len(old_hits), self.cfg.alert_modified_hours, + len(old_hits), + self.cfg.alert_modified_hours, ) # D→P 
transitions @@ -189,28 +204,29 @@ async def poll_once(self) -> PollResult: if paper.number is None or paper.revision is None or paper.prefix != "P": continue for ext in self.cfg.probe_extensions: - d_url = ( - f"https://isocpp.org/files/papers/" - f"D{paper.number:04d}R{paper.revision}{ext}" - ) + d_url = f"https://isocpp.org/files/papers/D{paper.number:04d}R{paper.revision}{ext}" info = self.state.discovered_info(d_url) if info is not None: - dp_transitions.append(DPTransition( - paper=paper, - draft_url=d_url, - last_modified=info.get("last_modified"), - discovered_at=info.get("discovered_at", 0.0), - )) + dp_transitions.append( + DPTransition( + paper=paper, + draft_url=d_url, + last_modified=info.get("last_modified"), + discovered_at=info.get("discovered_at", 0.0), + ) + ) lm_ts = info.get("last_modified") disc_ts = info.get("discovered_at", 0.0) log.info( - "D-TO-P id=%s draft=%s " - "draft-lm=%s draft-discovered=%s", - paper.id, d_url, + "D-TO-P id=%s draft=%s draft-lm=%s draft-discovered=%s", + paper.id, + d_url, datetime.fromtimestamp(lm_ts, tz=timezone.utc).strftime("%Y-%m-%d") - if lm_ts else "unknown", + if lm_ts + else "unknown", datetime.fromtimestamp(disc_ts, tz=timezone.utc).strftime("%Y-%m-%d") - if disc_ts else "unknown", + if disc_ts + else "unknown", ) break @@ -224,7 +240,9 @@ async def poll_once(self) -> PollResult: for uid, m in per_user_matches.items(): log.info( "WATCHLIST-MATCH user=%s papers=%d probe_hits=%d", - uid, len(m.papers), len(m.probe_hits), + uid, + len(m.papers), + len(m.probe_hits), ) result = PollResult( @@ -242,21 +260,26 @@ async def poll_once(self) -> PollResult: "index-new=%d index-upd=%d " "probe-recent=%d probe-old=%d " "dp-transitions=%d users-notified=%d", - self._poll_count, elapsed, - len(diff.new_papers), len(diff.updated_papers), - len(recent_hits), len(old_hits), - len(dp_transitions), len(per_user_matches), + self._poll_count, + elapsed, + len(diff.new_papers), + len(diff.updated_papers), + len(recent_hits), + 
len(old_hits), + len(dp_transitions), + len(per_user_matches), ) return result async def run_forever(self) -> None: - interval = self.cfg.poll_interval_minutes * 60 - cooldown = self.cfg.poll_overrun_cooldown_seconds + interval = self.cfg.poll_interval_minutes * 60 + cooldown = self.cfg.poll_overrun_cooldown_seconds log.info( - "SCHEDULER-START interval=%dmin overrun_cooldown=%ds " - "iso_probe=%s wg21=%s", - self.cfg.poll_interval_minutes, cooldown, - self.cfg.enable_iso_probe, self.cfg.enable_bulk_wg21, + "SCHEDULER-START interval=%dmin overrun_cooldown=%ds iso_probe=%s wg21=%s", + self.cfg.poll_interval_minutes, + cooldown, + self.cfg.enable_iso_probe, + self.cfg.enable_bulk_wg21, ) while True: t0 = time.monotonic() @@ -269,6 +292,8 @@ async def run_forever(self) -> None: sleep_for = max(interval - elapsed, cooldown) log.info( "SCHEDULER-SLEEP sleep=%.0fs (poll=%.0fs interval=%ds)", - sleep_for, elapsed, interval, + sleep_for, + elapsed, + interval, ) await asyncio.sleep(sleep_for) diff --git a/src/paperscout/scout.py b/src/paperscout/scout.py index 18f071a..8da8ce5 100644 --- a/src/paperscout/scout.py +++ b/src/paperscout/scout.py @@ -11,8 +11,8 @@ from .config import settings from .models import Paper -from .monitor import DPTransition, PerUserMatches, PollResult -from .sources import ProbeHit, Tier +from .monitor import PollResult +from .sources import Tier from .storage import ProbeState, UserWatchlist log = logging.getLogger(__name__) @@ -30,6 +30,7 @@ def create_app() -> App: # ── Message Queue ───────────────────────────────────────────────────────────── + class MessageQueue: """Thread-safe, rate-limited Slack ``chat.postMessage`` queue. 
@@ -90,7 +91,8 @@ def _send_with_retry(self, channel: str, text: str, kwargs: dict) -> None: retry_after = int(exc.response.headers.get("Retry-After", "5")) log.warning( "MQ 429 rate-limited channel=%s retry_after=%ds", - channel, retry_after, + channel, + retry_after, ) time.sleep(retry_after) # Re-throttle per-channel timer after sleeping @@ -106,6 +108,7 @@ def _send_with_retry(self, channel: str, text: str, kwargs: dict) -> None: # ── Helpers ─────────────────────────────────────────────────────────────────── + def _paper_link(paper: Paper) -> str: url = paper.url or paper.long_link if not url: @@ -134,6 +137,7 @@ def _fmt_lm(lm: datetime | None) -> str: # ── Channel notification ────────────────────────────────────────────────────── + def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: """Post batch/non-watchlist events to the configured notification channel.""" channel = settings.notification_channel @@ -150,11 +154,13 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: d_link = f"<{tr.draft_url}|draft>" disc_str = ( datetime.fromtimestamp(tr.discovered_at, tz=timezone.utc).strftime("%Y-%m-%d") - if tr.discovered_at else "?" + if tr.discovered_at + else "?" 
) lm_str = _fmt_lm( datetime.fromtimestamp(tr.last_modified, tz=timezone.utc) - if tr.last_modified else None + if tr.last_modified + else None ) lines.append( f"• {p_link} — {tr.paper.title}" @@ -164,7 +170,7 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: # Frontier probe hits frontier_hits = [h for h in result.probe_hits if h.tier == Tier.FRONTIER] - other_hits = [h for h in result.probe_hits if h.tier != Tier.FRONTIER] + other_hits = [h for h in result.probe_hits if h.tier != Tier.FRONTIER] if settings.notify_on_frontier_hit and frontier_hits: lines.append(f"*:mag: {len(frontier_hits)} new frontier draft(s):*") @@ -186,8 +192,11 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: batches = _batch_lines(lines, SLACK_MAX_TEXT) log.info( "NOTIFY channel=%s messages=%d dp=%d frontier=%d other=%d", - channel, len(batches), - len(result.dp_transitions), len(frontier_hits), len(other_hits), + channel, + len(batches), + len(result.dp_transitions), + len(frontier_hits), + len(other_hits), ) for batch in batches: mq.enqueue(channel, batch) @@ -195,6 +204,7 @@ def notify_channel(app: App, result: PollResult, mq: MessageQueue) -> None: # ── Per-user DM notifications ───────────────────────────────────────────────── + def notify_users(app: App, result: PollResult, mq: MessageQueue) -> None: """Send DMs to users whose watchlist matched new papers or probe hits.""" if not result.per_user_matches: @@ -224,7 +234,10 @@ def notify_users(app: App, result: PollResult, mq: MessageQueue) -> None: batches = _batch_lines(lines, SLACK_MAX_TEXT) log.info( "NOTIFY-USER user=%s messages=%d papers=%d hits=%d", - user_id, len(batches), len(matches.papers), len(matches.probe_hits), + user_id, + len(batches), + len(matches.papers), + len(matches.probe_hits), ) for batch in batches: mq.enqueue(user_id, batch) @@ -249,6 +262,7 @@ def _batch_lines(lines: list[str], max_len: int) -> list[str]: # ── Command handlers 
────────────────────────────────────────────────────────── + def register_handlers( app: App, user_watchlist: UserWatchlist, @@ -256,7 +270,6 @@ def register_handlers( paper_count_fn, launch_time: datetime | None = None, ) -> None: - def _dispatch(text: str, user_id: str, channel_type: str, say, reply_opts: dict) -> None: words = [w for w in text.split() if w] if not words: @@ -413,9 +426,13 @@ def _show_watchlist( def _handle_status(state: ProbeState, paper_count_fn, say, reply_opts: dict) -> None: - from datetime import datetime as _dt, timezone as _tz + from datetime import datetime as _dt + from datetime import timezone as _tz + last = state.last_poll - last_str = _dt.fromtimestamp(last, tz=_tz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") if last else "never" + last_str = ( + _dt.fromtimestamp(last, tz=_tz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") if last else "never" + ) say( text=( f"*Paperscout Status*\n" @@ -433,6 +450,7 @@ def _handle_status(state: ProbeState, paper_count_fn, say, reply_opts: dict) -> def _handle_version(say, reply_opts: dict) -> None: from . 
import __version__ + say(text=f"Paperscout v{__version__}", **reply_opts) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 721f342..823137a 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -4,11 +4,12 @@ import logging import re import time -from enum import Enum -from typing import Iterable +from collections.abc import Iterable from dataclasses import dataclass, field from datetime import date, datetime, timedelta, timezone from email.utils import parsedate_to_datetime +from enum import Enum + import httpx from .config import Settings, settings @@ -31,9 +32,9 @@ class WG21Index: def __init__(self, pool): self._cache = PaperCache(pool, ttl_hours=settings.cache_ttl_hours) self.papers: dict[str, Paper] = {} - self._max_rev: dict[int, int] = {} # P-number -> highest revision - self._max_p: int = 0 # absolute highest P-number - self._sorted_p_nums: list[int] = [] # sorted unique P-numbers, for gap analysis + self._max_rev: dict[int, int] = {} # P-number -> highest revision + self._max_p: int = 0 # absolute highest P-number + self._sorted_p_nums: list[int] = [] # sorted unique P-numbers, for gap analysis async def refresh(self) -> dict[str, Paper]: cached = self._cache.read_if_fresh() @@ -255,12 +256,12 @@ class ISOProber: # Keys that _stats is reset to at the start of every run_cycle(). 
_STATS_TEMPLATE: dict[str, int] = { "skipped_discovered": 0, # URL already in probe_state - "skipped_in_index": 0, # paper_id already in wg21.link index - "miss": 0, # server returned non-200 - "hit_recent": 0, # 200 + Last-Modified within alert window - "hit_old": 0, # 200 + Last-Modified outside alert window - "hit_no_lm": 0, # 200 + no Last-Modified header (treated as recent) - "error": 0, # httpx / network exception + "skipped_in_index": 0, # paper_id already in wg21.link index + "miss": 0, # server returned non-200 + "hit_recent": 0, # 200 + Last-Modified within alert window + "hit_old": 0, # 200 + Last-Modified outside alert window + "hit_no_lm": 0, # 200 + no Last-Modified header (treated as recent) + "error": 0, # httpx / network exception } def __init__( @@ -285,15 +286,17 @@ async def run_cycle(self) -> list[ProbeHit]: t0 = time.monotonic() urls = self._build_probe_list() - hot_count = sum( - 1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT) - ) + hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT)) cold_count = sum(1 for u in urls if u[1] == Tier.COLD) slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor log.info( "PROBE-START cycle=%d total=%d hot=%d cold=%d slice=%d/%d", - self._cycle, len(urls), hot_count, cold_count, - slice_idx, self.cfg.cold_cycle_divisor, + self._cycle, + len(urls), + hot_count, + cold_count, + slice_idx, + self.cfg.cold_cycle_divisor, ) sem = asyncio.Semaphore(self.cfg.http_concurrency) @@ -329,9 +332,17 @@ async def run_cycle(self) -> list[ProbeHit]: "PROBE-DONE cycle=%d elapsed=%.1fs total=%d " "hit=%d(recent=%d old=%d no-lm=%d) miss=%d " "skip-disc=%d skip-idx=%d err=%d", - self._cycle, elapsed, len(urls), - hit_total, s["hit_recent"], s["hit_old"], s["hit_no_lm"], - s["miss"], s["skipped_discovered"], s["skipped_in_index"], s["error"], + self._cycle, + elapsed, + len(urls), + hit_total, + s["hit_recent"], + s["hit_old"], + s["hit_no_lm"], + s["miss"], + 
s["skipped_discovered"], + s["skipped_in_index"], + s["error"], ) return hits @@ -343,9 +354,8 @@ def _build_probe_list(self) -> list[_Entry]: extra_p_numbers=self.state.paper_nums_from_discovered_iso_urls(), ) hot_known, hot_unknown = self._hot_numbers(frontier) - return ( - self._build_hot_list(frontier, hot_known, hot_unknown) - + self._build_cold_slice(self._cycle, frontier, hot_known, hot_unknown) + return self._build_hot_list(frontier, hot_known, hot_unknown) + self._build_cold_slice( + self._cycle, frontier, hot_known, hot_unknown ) def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: @@ -364,9 +374,7 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: # Recently active papers if self.cfg.hot_lookback_months > 0: - cutoff = date.today() - timedelta( - days=int(self.cfg.hot_lookback_months * 30.44) - ) + cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44)) for p in self.index.papers.values(): if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown": continue @@ -491,7 +499,7 @@ async def _probe_one( break except httpx.HTTPError as exc: if _attempt < _max_retries - 1: - await asyncio.sleep(0.5 * (2 ** _attempt)) + await asyncio.sleep(0.5 * (2**_attempt)) continue log.debug("ERR %s %s (after %d attempts)", url, exc, _max_retries) self._stats["error"] += 1 @@ -512,9 +520,7 @@ async def _probe_one( try: last_modified = parsedate_to_datetime(lm_str) threshold = timedelta(hours=self.cfg.alert_modified_hours) - is_recent = ( - datetime.now(timezone.utc) - last_modified - ) <= threshold + is_recent = (datetime.now(timezone.utc) - last_modified) <= threshold except Exception: pass else: @@ -522,13 +528,13 @@ async def _probe_one( # file; treat as recent so we don't silently drop it. 
is_recent = True - lm_display = ( - last_modified.strftime("%Y-%m-%d %H:%M UTC") - if last_modified else "no-lm" - ) + lm_display = last_modified.strftime("%Y-%m-%d %H:%M UTC") if last_modified else "no-lm" log.info( "HIT tier=%-10s recent=%-5s lm=%-20s %s", - tier, is_recent, lm_display, url, + tier, + is_recent, + lm_display, + url, ) if is_recent and last_modified is not None: @@ -544,8 +550,12 @@ async def _probe_one( front_text = await _fetch_front_text(client, prefix, num, rev) return ProbeHit( - url=url, prefix=prefix, number=num, - revision=rev, extension=ext, tier=tier, + url=url, + prefix=prefix, + number=num, + revision=rev, + extension=ext, + tier=tier, front_text=front_text, last_modified=last_modified, is_recent=is_recent, @@ -578,7 +588,8 @@ async def scrape_open_std(year: int | None = None) -> list[OpenStdEntry]: url = OPEN_STD_URL.format(year=year) try: async with httpx.AsyncClient( - http2=settings.http_use_http2, timeout=30.0, + http2=settings.http_use_http2, + timeout=30.0, ) as client: resp = await client.get(url) resp.raise_for_status() @@ -603,8 +614,13 @@ def _parse_open_std_html(html: str) -> list[OpenStdEntry]: author = re.sub(r"<[^>]+>", "", cells[2]).strip() doc_date = re.sub(r"<[^>]+>", "", cells[3]).strip() subgroup = re.sub(r"<[^>]+>", "", cells[6]).strip() if len(cells) > 6 else "" - entries.append(OpenStdEntry( - paper_id=paper_id, title=title, author=author, - doc_date=doc_date, subgroup=subgroup, - )) + entries.append( + OpenStdEntry( + paper_id=paper_id, + title=title, + author=author, + doc_date=doc_date, + subgroup=subgroup, + ) + ) return entries diff --git a/src/paperscout/storage.py b/src/paperscout/storage.py index 523fb5a..4c2cc31 100644 --- a/src/paperscout/storage.py +++ b/src/paperscout/storage.py @@ -1,16 +1,20 @@ """PostgreSQL-backed storage: PaperCache, ProbeState, UserWatchlist.""" + from __future__ import annotations import json import logging import re import time +from collections.abc import Generator from 
contextlib import contextmanager -from typing import TYPE_CHECKING, Generator +from typing import TYPE_CHECKING if TYPE_CHECKING: from psycopg2.pool import ThreadedConnectionPool + from .monitor import PerUserMatches + log = logging.getLogger(__name__) # isocpp.org draft URLs (same path shape as ISOProber) @@ -28,8 +32,9 @@ def iso_paper_number_from_discovered_url(url: str) -> int | None: # ── Connection helper ──────────────────────────────────────────────────────── + @contextmanager -def _conn(pool: "ThreadedConnectionPool") -> Generator: +def _conn(pool: ThreadedConnectionPool) -> Generator: conn = pool.getconn() try: yield conn @@ -54,7 +59,7 @@ class PaperCache: changes. """ - def __init__(self, pool: "ThreadedConnectionPool", ttl_hours: float = 1.0): + def __init__(self, pool: ThreadedConnectionPool, ttl_hours: float = 1.0): self._pool = pool self.ttl_seconds = ttl_hours * 3600 @@ -113,13 +118,14 @@ def write(self, data: dict) -> None: # ── Probe State ────────────────────────────────────────────────────────────── + class ProbeState: """PostgreSQL-backed probe state: discovered URLs, miss counters, last-poll. All existing methods are preserved with identical signatures. 
""" - def __init__(self, pool: "ThreadedConnectionPool"): + def __init__(self, pool: ThreadedConnectionPool): self._pool = pool self._ensure_poll_row() @@ -143,10 +149,7 @@ def get_all_discovered(self) -> dict[str, dict]: with conn.cursor() as cur: cur.execute("SELECT url, last_modified, discovered_at FROM discovered_urls") rows = cur.fetchall() - return { - url: {"last_modified": lm, "discovered_at": da} - for url, lm, da in rows - } + return {url: {"last_modified": lm, "discovered_at": da} for url, lm, da in rows} def mark_discovered(self, url: str, last_modified_ts: float | None = None) -> None: with _conn(self._pool) as conn: @@ -271,6 +274,7 @@ def save(self) -> None: # ── User Watchlist ─────────────────────────────────────────────────────────── + class UserWatchlist: """Per-user watchlist stored in the ``user_watchlist`` table. @@ -279,7 +283,7 @@ class UserWatchlist: auto-detected: pure digit strings → paper, anything else → author. """ - def __init__(self, pool: "ThreadedConnectionPool"): + def __init__(self, pool: ThreadedConnectionPool): self._pool = pool @staticmethod @@ -330,9 +334,7 @@ def get_all_watched_paper_nums(self) -> set[int]: """Return the union of all watched paper numbers across all users.""" with _conn(self._pool) as conn: with conn.cursor() as cur: - cur.execute( - "SELECT entry FROM user_watchlist WHERE entry_type = 'paper'" - ) + cur.execute("SELECT entry FROM user_watchlist WHERE entry_type = 'paper'") rows = cur.fetchall() result: set[int] = set() for (entry,) in rows: @@ -346,16 +348,14 @@ def _get_all_entries(self) -> list[tuple[str, str, str]]: """Return all rows as ``[(slack_user_id, entry, entry_type)]``.""" with _conn(self._pool) as conn: with conn.cursor() as cur: - cur.execute( - "SELECT slack_user_id, entry, entry_type FROM user_watchlist" - ) + cur.execute("SELECT slack_user_id, entry, entry_type FROM user_watchlist") return [(row[0], row[1], row[2]) for row in cur.fetchall()] def matches_for_users( self, - new_papers: list, 
# list[Paper] - probe_hits: list, # list[ProbeHit] - ) -> dict[str, "PerUserMatches"]: + new_papers: list, # list[Paper] + probe_hits: list, # list[ProbeHit] + ) -> dict[str, PerUserMatches]: """Compute per-user matched papers and probe hits. Returns a dict keyed by ``slack_user_id``. Only users with at least diff --git a/tests/conftest.py b/tests/conftest.py index 80dda73..522fb2a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,15 +3,14 @@ from __future__ import annotations import json as _json +import tempfile from pathlib import Path import pytest from paperscout.config import Settings -from paperscout.models import Paper -from paperscout.storage import ProbeState, UserWatchlist from paperscout.sources import WG21Index -import tempfile +from paperscout.storage import ProbeState # ── FakePool ───────────────────────────────────────────────────────────────── # An in-memory substitute for psycopg2.pool.ThreadedConnectionPool that @@ -116,9 +115,7 @@ def execute(self, sql: str, params=()): self._s.watchlist[key] = etype self.rowcount = 1 - elif ( - "DELETE FROM USER_WATCHLIST WHERE SLACK_USER_ID" in su and "AND ENTRY" in su - ): + elif "DELETE FROM USER_WATCHLIST WHERE SLACK_USER_ID" in su and "AND ENTRY" in su: uid, entry = params[0], params[1] key = (uid, entry) if key in self._s.watchlist: @@ -131,9 +128,7 @@ def execute(self, sql: str, params=()): self._rows = sorted(rows, key=lambda x: (x[1], x[0])) elif "SELECT ENTRY FROM USER_WATCHLIST WHERE ENTRY_TYPE" in su: - self._rows = [ - (e,) for (_, e), t in self._s.watchlist.items() if t == "paper" - ] + self._rows = [(e,) for (_, e), t in self._s.watchlist.items() if t == "paper"] elif "SELECT SLACK_USER_ID, ENTRY, ENTRY_TYPE FROM USER_WATCHLIST" in su: self._rows = [(u, e, t) for (u, e), t in self._s.watchlist.items()] diff --git a/tests/test_health.py b/tests/test_health.py index 9e2dcaf..25b8487 100644 --- a/tests/test_health.py +++ b/tests/test_health.py @@ -1,4 +1,5 @@ """Tests for 
paperscout.health.""" + from __future__ import annotations import json @@ -12,6 +13,7 @@ def _find_free_port() -> int: import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] diff --git a/tests/test_models.py b/tests/test_models.py index 831b1c0..8f209a8 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,13 +1,14 @@ """Tests for paperscout.models.""" + from __future__ import annotations import pytest from paperscout.models import FileExt, Paper, PaperPrefix, PaperType - # ── Enum sanity ────────────────────────────────────────────────────────────── + def test_paper_prefix_values(): assert PaperPrefix.D == "D" assert PaperPrefix.P == "P" @@ -36,6 +37,7 @@ def test_file_ext_values(): # ── Paper properties: P-prefix ─────────────────────────────────────────────── + def test_paper_p_number_prefix_revision(): p = Paper(id="P2300R10") assert p.number == 2300 @@ -59,6 +61,7 @@ def test_paper_d_prefix(): # ── Paper properties: N-prefix ─────────────────────────────────────────────── + def test_paper_n_number(): p = Paper(id="N4950") assert p.number == 4950 @@ -68,13 +71,17 @@ def test_paper_n_number(): # ── Paper properties: issue tracker prefixes ───────────────────────────────── -@pytest.mark.parametrize("paper_id,expected_prefix,expected_num", [ - ("CWG123", "CWG", 123), - ("EWG456", "EWG", 456), - ("LWG789", "LWG", 789), - ("LEWG42", "LEWG", 42), - ("FS10", "FS", 10), -]) + +@pytest.mark.parametrize( + "paper_id,expected_prefix,expected_num", + [ + ("CWG123", "CWG", 123), + ("EWG456", "EWG", 456), + ("LWG789", "LWG", 789), + ("LEWG42", "LEWG", 42), + ("FS10", "FS", 10), + ], +) def test_paper_issue_prefixes(paper_id, expected_prefix, expected_num): p = Paper(id=paper_id) assert p.prefix == expected_prefix @@ -84,6 +91,7 @@ def test_paper_issue_prefixes(paper_id, expected_prefix, expected_num): # ── Paper properties: unknown IDs ──────────────────────────────────────────── + def 
test_paper_unknown_id(): p = Paper(id="UNKNOWN") assert p.number is None @@ -100,6 +108,7 @@ def test_paper_empty_id(): # ── Paper.from_index_entry ─────────────────────────────────────────────────── + def test_from_index_entry_full(): entry = { "title": "Test Paper", @@ -176,6 +185,7 @@ def test_from_index_entry_standing_document_type(): # ── Paper dataclass defaults ────────────────────────────────────────────────── + def test_paper_default_fields(): p = Paper(id="P1234R0") assert p.title == "" diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 4fbcfbe..cf63ec4 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -1,8 +1,9 @@ """Tests for paperscout.monitor.""" + from __future__ import annotations import asyncio -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -23,8 +24,12 @@ def _recent_hit(**kwargs) -> ProbeHit: defaults = dict( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="frontier", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="frontier", + is_recent=True, ) defaults.update(kwargs) return ProbeHit(**defaults) @@ -33,8 +38,12 @@ def _recent_hit(**kwargs) -> ProbeHit: def _old_hit(**kwargs) -> ProbeHit: defaults = dict( url="https://isocpp.org/files/papers/D8888R0.pdf", - prefix="D", number=8888, revision=0, extension=".pdf", - tier="cold", is_recent=False, + prefix="D", + number=8888, + revision=0, + extension=".pdf", + tier="cold", + is_recent=False, last_modified=datetime.now(timezone.utc) - timedelta(days=30), ) defaults.update(kwargs) @@ -43,6 +52,7 @@ def _old_hit(**kwargs) -> ProbeHit: # ── diff_snapshots ──────────────────────────────────────────────────────────── + class TestDiffSnapshots: def _paper(self, pid, **kwargs) -> Paper: defaults = dict(title="T", author="A", date="2024-01-01") @@ 
-89,8 +99,8 @@ def test_new_papers_sorted_by_date_descending(self): prev = {} curr = { "P2300R10": self._paper("P2300R10", date="2024-01-01"), - "P2301R0": self._paper("P2301R0", date="2024-06-01"), - "P2302R0": self._paper("P2302R0", date="2024-03-01"), + "P2301R0": self._paper("P2301R0", date="2024-06-01"), + "P2302R0": self._paper("P2302R0", date="2024-03-01"), } result = diff_snapshots(prev, curr) dates = [p.date for p in result.new_papers] @@ -103,6 +113,7 @@ def test_empty_to_empty(self): # ── PollResult ──────────────────────────────────────────────────────────────── + class TestPollResult: def test_defaults(self): diff = DiffResult(new_papers=[], updated_papers=[]) @@ -112,6 +123,7 @@ def test_defaults(self): def test_explicit_dp_transitions(self): from paperscout.monitor import DPTransition + diff = DiffResult(new_papers=[], updated_papers=[]) paper = Paper(id="P2300R11") tr = DPTransition(paper=paper, draft_url="http://x", last_modified=None, discovered_at=0.0) @@ -128,6 +140,7 @@ def test_explicit_per_user_matches(self): # ── Scheduler ───────────────────────────────────────────────────────────────── + def _make_scheduler(fake_pool, **cfg_overrides): index = MagicMock(spec=WG21Index) index.refresh = AsyncMock() @@ -139,8 +152,11 @@ def _make_scheduler(fake_pool, **cfg_overrides): state = ProbeState(fake_pool) cfg = make_test_settings(**cfg_overrides) scheduler = Scheduler( - index=index, prober=prober, - user_watchlist=user_watchlist, state=state, cfg=cfg, + index=index, + prober=prober, + user_watchlist=user_watchlist, + state=state, + cfg=cfg, ) return scheduler, index, prober, user_watchlist, state @@ -187,8 +203,9 @@ async def test_poll_once_detects_dp_transition(self, fake_pool): draft_url = "https://isocpp.org/files/papers/D9999R0.pdf" state.mark_discovered(draft_url, last_modified_ts=1_700_000_000.0) - new_paper = Paper(id="P9999R0", title="New Published Paper", - author="Author", date="2025-01-01") + new_paper = Paper( + id="P9999R0", title="New 
Published Paper", author="Author", date="2025-01-01" + ) index.papers = {"P9999R0": new_paper} prober.run_cycle = AsyncMock(return_value=[]) @@ -223,6 +240,7 @@ async def test_poll_once_no_dp_transition_when_no_draft(self, fake_pool): async def test_poll_once_dp_transition_logged(self, fake_pool, caplog): import logging + scheduler, index, prober, _, state = _make_scheduler(fake_pool) await scheduler.poll_once() @@ -247,6 +265,7 @@ async def test_poll_count_increments(self, fake_pool): async def test_poll_once_logs_updated_papers(self, fake_pool, caplog): import logging + scheduler, index, prober, _, _ = _make_scheduler(fake_pool) await scheduler.poll_once() @@ -261,6 +280,7 @@ async def test_poll_once_logs_updated_papers(self, fake_pool, caplog): async def test_poll_old_hits_logged(self, fake_pool, caplog): import logging + scheduler, index, prober, _, _ = _make_scheduler(fake_pool) await scheduler.poll_once() old = _old_hit() @@ -305,8 +325,8 @@ async def test_poll_once_calls_notify_callback(self, fake_pool): notified = [] scheduler, _, _, _, _ = _make_scheduler(fake_pool) scheduler.notify_callback = notified.append - await scheduler.poll_once() # seed - await scheduler.poll_once() # real poll + await scheduler.poll_once() # seed + await scheduler.poll_once() # real poll assert len(notified) == 1 async def test_poll_once_skips_refresh_when_disabled(self, fake_pool): diff --git a/tests/test_scout.py b/tests/test_scout.py index 5f10869..d4b13a6 100644 --- a/tests/test_scout.py +++ b/tests/test_scout.py @@ -1,17 +1,13 @@ """Tests for paperscout.scout.""" + from __future__ import annotations -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, patch -import pytest - from paperscout.models import Paper from paperscout.monitor import DiffResult, DPTransition, PerUserMatches, PollResult -from paperscout.sources import ProbeHit -from paperscout.storage import ProbeState, UserWatchlist 
from paperscout.scout import ( - MessageQueue, _batch_lines, _fmt_lm, _format_uptime, @@ -27,10 +23,12 @@ notify_users, register_handlers, ) - +from paperscout.sources import ProbeHit +from paperscout.storage import ProbeState, UserWatchlist # ── Helpers ─────────────────────────────────────────────────────────────────── + def _empty_diff() -> DiffResult: return DiffResult(new_papers=[], updated_papers=[]) @@ -70,8 +68,12 @@ def _make_settings(channel="C123456", **overrides): def _recent_hit(tier="frontier", number=9999, **kwargs) -> ProbeHit: defaults = dict( url=f"https://isocpp.org/files/papers/D{number:04d}R0.pdf", - prefix="D", number=number, revision=0, extension=".pdf", - tier=tier, is_recent=True, + prefix="D", + number=number, + revision=0, + extension=".pdf", + tier=tier, + is_recent=True, last_modified=datetime.now(timezone.utc) - timedelta(hours=2), ) defaults.update(kwargs) @@ -80,6 +82,7 @@ def _recent_hit(tier="frontier", number=9999, **kwargs) -> ProbeHit: # ── _fmt_lm ─────────────────────────────────────────────────────────────────── + class TestFmtLm: def test_none(self): assert "unknown" in _fmt_lm(None) @@ -99,6 +102,7 @@ def test_days_ago_shows_date(self): # ── _paper_link / _hit_label ────────────────────────────────────────────────── + class TestHelpers: def test_paper_link_uses_url(self): paper = Paper(id="P2300R10", url="https://wg21.link/P2300R10") @@ -115,13 +119,13 @@ def test_paper_link_synthesises_wg21_url(self): assert "|P2300R10>" in link def test_hit_label(self): - label = _hit_label("https://isocpp.org/files/papers/D2300R11.pdf", - "D", 2300, 11, ".pdf") + label = _hit_label("https://isocpp.org/files/papers/D2300R11.pdf", "D", 2300, 11, ".pdf") assert label == "" # ── notify_channel ──────────────────────────────────────────────────────────── + class TestNotifyChannel: def test_no_channel_returns_silently(self): app = MagicMock() @@ -203,11 +207,18 @@ def test_last_modified_shown_in_batch(self): def 
test_dp_all_transitions_are_batched(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="Senders", author="Unknown Author", - url="https://wg21.link/P2300R11") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D2300R11.pdf", - last_modified=1_700_000_000.0, discovered_at=1_699_900_000.0) + paper = Paper( + id="P2300R11", + title="Senders", + author="Unknown Author", + url="https://wg21.link/P2300R11", + ) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D2300R11.pdf", + last_modified=1_700_000_000.0, + discovered_at=1_699_900_000.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings()): notify_channel(app, result, mq) @@ -220,9 +231,12 @@ def test_dp_suppressed_when_disabled(self): app = MagicMock() mq = MagicMock() paper = Paper(id="P2300R11", title="X", author="Y") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D2300R11.pdf", - last_modified=None, discovered_at=0.0) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D2300R11.pdf", + last_modified=None, + discovered_at=0.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings(notify_on_dp_transition=False)): notify_channel(app, result, mq) @@ -232,9 +246,12 @@ def test_dp_batch_contains_draft_link(self): app = MagicMock() mq = MagicMock() paper = Paper(id="P9999R0", title="Foo", author="Bar", url="") - tr = DPTransition(paper=paper, - draft_url="https://isocpp.org/files/papers/D9999R0.pdf", - last_modified=None, discovered_at=0.0) + tr = DPTransition( + paper=paper, + draft_url="https://isocpp.org/files/papers/D9999R0.pdf", + last_modified=None, + discovered_at=0.0, + ) result = _make_result(dp_transitions=[tr]) with patch("paperscout.scout.settings", _make_settings()): notify_channel(app, result, mq) @@ -245,6 +262,7 @@ def test_dp_batch_contains_draft_link(self): # ── 
notify_users ────────────────────────────────────────────────────────────── + class TestNotifyUsers: def test_no_matches_posts_nothing(self): app = MagicMock() @@ -256,8 +274,9 @@ def test_no_matches_posts_nothing(self): def test_author_match_sends_dm(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="Senders", author="Eric Niebler", - url="https://wg21.link/P2300R11") + paper = Paper( + id="P2300R11", title="Senders", author="Eric Niebler", url="https://wg21.link/P2300R11" + ) pum = PerUserMatches(papers=[(paper, "author")], probe_hits=[]) result = _make_result(per_user_matches={"U123": pum}) notify_users(app, result, mq) @@ -270,8 +289,7 @@ def test_author_match_sends_dm(self): def test_paper_match_sends_dm(self): app = MagicMock() mq = MagicMock() - paper = Paper(id="P2300R11", title="X", author="Someone", - url="https://wg21.link/P2300R11") + paper = Paper(id="P2300R11", title="X", author="Someone", url="https://wg21.link/P2300R11") pum = PerUserMatches(papers=[(paper, "paper")], probe_hits=[]) result = _make_result(per_user_matches={"U456": pum}) notify_users(app, result, mq) @@ -304,6 +322,7 @@ def test_multiple_users_get_separate_dms(self): # ── _batch_lines ────────────────────────────────────────────────────────────── + class TestBatchLines: def test_single_batch_when_small(self): batches = _batch_lines(["line1", "line2", "line3"], max_len=1000) @@ -324,6 +343,7 @@ def test_single_line_exceeding_limit(self): # ── _reply_opts ─────────────────────────────────────────────────────────────── + class TestReplyOpts: def test_no_thread(self): opts = _reply_opts({"ts": "123"}) @@ -337,6 +357,7 @@ def test_with_thread(self): # ── _handle_watchlist ───────────────────────────────────────────────────────── + class TestHandleWatchlist: def test_add_new_author(self, fake_pool): say = MagicMock() @@ -415,6 +436,7 @@ def test_reply_opts_forwarded(self, fake_pool): # ── _show_watchlist 
─────────────────────────────────────────────────────────── + class TestShowWatchlist: def test_empty_watchlist(self, fake_pool): say = MagicMock() @@ -441,6 +463,7 @@ def test_shows_type_labels(self, fake_pool): # ── _handle_status ──────────────────────────────────────────────────────────── + class TestHandleStatus: def test_status_never_polled(self, fake_pool): state = ProbeState(fake_pool) @@ -462,6 +485,7 @@ def test_status_after_poll(self, fake_pool): # ── register_handlers ───────────────────────────────────────────────────────── + class TestRegisterHandlers: def _setup(self, fake_pool): app = MagicMock() @@ -471,6 +495,7 @@ def capture_event(name): def decorator(fn): registered[name] = fn return fn + return decorator app.event.side_effect = capture_event @@ -556,8 +581,12 @@ def test_app_mention_channel_watchlist_silently_ignored(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["app_mention"]( - event={"text": "<@U1> watchlist list", "ts": "1", - "channel_type": "channel", "user": "U1"}, + event={ + "text": "<@U1> watchlist list", + "ts": "1", + "channel_type": "channel", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -567,8 +596,12 @@ def test_message_mpim_watchlist_gets_error(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["message"]( - event={"text": "<@U1> watchlist add niebler", "channel_type": "mpim", - "ts": "1", "user": "U1"}, + event={ + "text": "<@U1> watchlist add niebler", + "channel_type": "mpim", + "ts": "1", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -580,8 +613,7 @@ def test_message_mpim_status_responds(self, fake_pool): say = MagicMock() with patch("paperscout.scout.settings", _make_settings()): registered["message"]( - event={"text": "<@U1> status", "channel_type": "mpim", - "ts": "1", "user": "U1"}, + event={"text": "<@U1> status", "channel_type": "mpim", "ts": "1", "user": "U1"}, context={"bot_user_id": "U1"}, 
say=say, ) @@ -592,8 +624,12 @@ def test_message_subtype_ignored(self, fake_pool): registered, _, _ = self._setup(fake_pool) say = MagicMock() registered["message"]( - event={"text": "status", "subtype": "message_changed", "channel_type": "im", - "user": "U1"}, + event={ + "text": "status", + "subtype": "message_changed", + "channel_type": "im", + "user": "U1", + }, context={"bot_user_id": "U1"}, say=say, ) @@ -663,6 +699,7 @@ def test_dispatch_empty_text(self, fake_pool): # ── _handle_version ─────────────────────────────────────────────────────────── + class TestHandleVersion: def test_version_contains_version_string(self): say = MagicMock() @@ -680,6 +717,7 @@ def test_version_forwards_reply_opts(self): # ── _format_uptime / _handle_uptime ────────────────────────────────────────── + class TestUptime: def test_format_uptime_minutes_only(self): delta = timedelta(minutes=5) @@ -724,6 +762,7 @@ def test_handle_uptime_forwards_reply_opts(self): # ── dispatch: version / uptime ──────────────────────────────────────────────── + class TestDispatchVersionUptime: def _setup(self, fake_pool, launch_time=None): app = MagicMock() @@ -733,6 +772,7 @@ def capture_event(name): def decorator(fn): registered[name] = fn return fn + return decorator app.event.side_effect = capture_event @@ -781,9 +821,11 @@ def test_help_mentions_version_and_uptime(self, fake_pool): # ── create_app ──────────────────────────────────────────────────────────────── + class TestCreateApp: def test_create_app_uses_settings(self): from paperscout.scout import create_app + mock_settings = MagicMock() mock_settings.slack_bot_token = "xoxb-test" mock_settings.slack_signing_secret = "secret" diff --git a/tests/test_sources.py b/tests/test_sources.py index a08c684..95adfe3 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -1,4 +1,5 @@ """Tests for paperscout.sources.""" + from __future__ import annotations import asyncio @@ -12,8 +13,6 @@ from paperscout.models import Paper from 
paperscout.sources import ( ISOProber, - OpenStdEntry, - ProbeHit, WG21Index, _fetch_front_text, _fetch_pdf_text, @@ -33,8 +32,10 @@ def _mock_wl(paper_nums=None): # ── Helpers ────────────────────────────────────────────────────────────────── -def _make_response(status: int = 200, json_data=None, text: str = "", - last_modified: datetime | None = None) -> MagicMock: + +def _make_response( + status: int = 200, json_data=None, text: str = "", last_modified: datetime | None = None +) -> MagicMock: resp = MagicMock() resp.status_code = status resp.json = MagicMock(return_value=json_data or {}) @@ -57,7 +58,7 @@ def _make_stream_cm(status: int = 404, chunks: list[bytes] | None = None) -> Asy resp.status_code = status async def _aiter_bytes(chunk_size=65536): - for chunk in (chunks or []): + for chunk in chunks or []: yield chunk resp.aiter_bytes = _aiter_bytes @@ -85,6 +86,7 @@ def _old_lm() -> datetime: # ── WG21Index ──────────────────────────────────────────────────────────────── + class TestWG21Index: async def test_refresh_downloads_when_no_cache(self, fake_pool): index = WG21Index(fake_pool) @@ -228,6 +230,7 @@ def test_parse_ignores_non_dict_entries(self, fake_pool): # ── _fetch_front_text ───────────────────────────────────────────────────────── + class TestFetchFrontText: async def test_returns_plain_text_on_success(self): html = "

Author: Eric Niebler

" @@ -258,6 +261,7 @@ async def test_truncates_to_1000_words(self): # ── _fetch_pdf_text ─────────────────────────────────────────────────────────── + class TestFetchPdfText: async def test_returns_empty_on_non_200(self): client = _make_async_client(stream_cm=_make_stream_cm(404)) @@ -266,6 +270,7 @@ async def test_returns_empty_on_non_200(self): async def test_returns_empty_when_fitz_missing(self): import sys + client = _make_async_client(stream_cm=_make_stream_cm(200, chunks=[b"%PDF-fake"])) with patch.dict(sys.modules, {"fitz": None}): result = await _fetch_pdf_text(client, "https://example.com/test.pdf") @@ -280,6 +285,7 @@ async def test_returns_empty_on_stream_exception(self): async def test_respects_byte_cap(self): """stream() should be cut off after _PDF_MAX_BYTES; no crash.""" from paperscout.sources import _PDF_MAX_BYTES + big_chunk = b"x" * (_PDF_MAX_BYTES + 1) # Even though the chunk exceeds the cap, _fetch_pdf_text must not raise. # Passing invalid PDF bytes → fitz raises → caught → returns "". @@ -337,8 +343,11 @@ async def test_fetch_front_text_falls_back_to_pdf(self): # ── ISOProber: hot/cold list builders ──────────────────────────────────────── + class TestISOProberLists: - def _make_prober(self, fake_pool, watchlist_nums=None, **cfg_overrides) -> tuple[ISOProber, WG21Index, ProbeState]: + def _make_prober( + self, fake_pool, watchlist_nums=None, **cfg_overrides + ) -> tuple[ISOProber, WG21Index, ProbeState]: index = WG21Index(fake_pool) state = ProbeState(fake_pool) cfg = make_test_settings(**cfg_overrides) @@ -437,9 +446,11 @@ def test_hot_recent_paper_by_date(self, fake_pool): recent_date = (date.today() - timedelta(days=30)).isoformat() # _parse_and_index updates _max_rev/_sorted_p_nums but not self.papers; # assign both so that the date-based hot filter can find the paper. 
- index.papers = index._parse_and_index({ - "P5000R2": {"title": "T", "date": recent_date, "type": "paper"}, - }) + index.papers = index._parse_and_index( + { + "P5000R2": {"title": "T", "date": recent_date, "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, _ = prober._hot_numbers(frontier) assert 5000 in hot_known @@ -452,9 +463,11 @@ def test_hot_old_paper_not_included(self, fake_pool): frontier_window_below=0, ) old_date = (date.today() - timedelta(days=365)).isoformat() - index._parse_and_index({ - "P5000R2": {"title": "T", "date": old_date, "type": "paper"}, - }) + index._parse_and_index( + { + "P5000R2": {"title": "T", "date": old_date, "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, _ = prober._hot_numbers(frontier) assert 5000 not in hot_known @@ -555,8 +568,9 @@ def test_hot_numbers_explicit_range(self, fake_pool): assert 200 in hot_unknown or 200 in hot_known def test_hot_paper_skipped_when_no_date(self, fake_pool): - prober, index, _ = self._make_prober(fake_pool, hot_lookback_months=6, - frontier_window_above=0, frontier_window_below=0) + prober, index, _ = self._make_prober( + fake_pool, hot_lookback_months=6, frontier_window_above=0, frontier_window_below=0 + ) # Paper with no date should be silently skipped (the `continue` branch) index.papers = index._parse_and_index({"P6000R0": {"title": "T", "type": "paper"}}) frontier = index.effective_frontier() @@ -564,8 +578,9 @@ def test_hot_paper_skipped_when_no_date(self, fake_pool): assert 6000 not in hot_known def test_hot_paper_skipped_when_bad_date(self, fake_pool): - prober, index, _ = self._make_prober(fake_pool, hot_lookback_months=6, - frontier_window_above=0, frontier_window_below=0) + prober, index, _ = self._make_prober( + fake_pool, hot_lookback_months=6, frontier_window_above=0, frontier_window_below=0 + ) index.papers = index._parse_and_index( {"P6001R0": {"title": "T", "date": "not-a-date", "type": "paper"}} ) @@ -575,8 +590,11 @@ def 
test_hot_paper_skipped_when_bad_date(self, fake_pool): def test_tier_label_recent_for_non_watchlist_non_frontier(self, fake_pool): prober, index, _ = self._make_prober( - fake_pool, watchlist_nums=[1], hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, + fake_pool, + watchlist_nums=[1], + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, ) self._set_frontier(index, 100) # Number 50 is not watchlist and not in frontier range → "recent" @@ -605,9 +623,13 @@ def test_build_hot_list_explicit_ranges_update_frontier_range(self, fake_pool): def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): """Known hot numbers with latest_revision=None should start from R0.""" prober, index, _ = self._make_prober( - fake_pool, watchlist_nums=[9999], hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, - hot_revision_depth=1, gap_max_rev=0, + fake_pool, + watchlist_nums=[9999], + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, + hot_revision_depth=1, + gap_max_rev=0, ) # Add 9999 to _max_rev so it's "known" but with latest_revision=None index._max_rev = {9999: -1, 99: 0, 100: 0} @@ -622,9 +644,12 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): def test_cold_known_skips_when_latest_none(self, fake_pool): """cold_known paper with latest_revision=None should be silently skipped.""" prober, index, _ = self._make_prober( - fake_pool, hot_lookback_months=0, - frontier_window_above=0, frontier_window_below=0, # empty frontier range - cold_cycle_divisor=1, cold_revision_depth=1, + fake_pool, + hot_lookback_months=0, + frontier_window_above=0, + frontier_window_below=0, # empty frontier range + cold_cycle_divisor=1, + cold_revision_depth=1, ) # 4 has _max_rev=-1 → latest_revision=None; 5 is normal # With no frontier window and no watchlist, both are cold_known @@ -635,7 +660,7 @@ def test_cold_known_skips_when_latest_none(self, fake_pool): urls = 
prober._build_cold_slice(1, frontier, hot_known, hot_unknown) cold_nums = {r[3] for r in urls if r[1] == "cold"} assert 4 not in cold_nums # skipped because latest_revision=None - assert 5 in cold_nums # normally probed + assert 5 in cold_nums # normally probed async def test_probe_one_bad_last_modified_header(self, fake_pool): """An unparsable Last-Modified header should not crash; is_recent stays False.""" @@ -662,10 +687,12 @@ def test_cold_excludes_hot_numbers(self, fake_pool): frontier_window_above=0, frontier_window_below=0, ) - index._parse_and_index({ - "P5000R2": {"title": "T", "date": "2020-01-01", "type": "paper"}, - "P5001R0": {"title": "T", "date": "2020-01-01", "type": "paper"}, - }) + index._parse_and_index( + { + "P5000R2": {"title": "T", "date": "2020-01-01", "type": "paper"}, + "P5001R0": {"title": "T", "date": "2020-01-01", "type": "paper"}, + } + ) frontier = index.effective_frontier() hot_known, hot_unknown = prober._hot_numbers(frontier) urls = prober._build_cold_slice(1, frontier, hot_known, hot_unknown) @@ -675,6 +702,7 @@ def test_cold_excludes_hot_numbers(self, fake_pool): # ── ISOProber: _probe_one ───────────────────────────────────────────────────── + class TestISOProberProbeOne: def _make_prober(self, fake_pool) -> tuple[ISOProber, WG21Index, ProbeState]: index = WG21Index(fake_pool) @@ -824,6 +852,7 @@ async def test_stats_error(self, fake_pool): async def test_run_cycle_logs_unhandled_exception(self, fake_pool, caplog): """If asyncio.gather returns an Exception (not ProbeHit), it is logged.""" import logging + index = WG21Index(fake_pool) index._max_p = 100 index._max_rev = {99: 0, 100: 0} @@ -831,9 +860,12 @@ async def test_run_cycle_logs_unhandled_exception(self, fake_pool, caplog): state = ProbeState(fake_pool) cfg = make_test_settings( watchlist_papers=[9999], - hot_lookback_months=0, hot_revision_depth=1, - frontier_window_above=0, frontier_window_below=0, - gap_max_rev=0, cold_cycle_divisor=100, + hot_lookback_months=0, + 
hot_revision_depth=1, + frontier_window_above=0, + frontier_window_below=0, + gap_max_rev=0, + cold_cycle_divisor=100, ) prober = ISOProber(index, state, user_watchlist=_mock_wl([9999]), cfg=cfg) @@ -882,6 +914,7 @@ async def test_stats_reset_each_cycle(self, fake_pool): # ── ISOProber: run_cycle ────────────────────────────────────────────────────── + class TestISOProberRunCycle: async def test_run_cycle_records_hit_and_marks_discovered(self, fake_pool): index = WG21Index(fake_pool) diff --git a/tests/test_storage.py b/tests/test_storage.py index 8243df8..741a267 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1,11 +1,10 @@ """Tests for paperscout.storage (PostgreSQL-backed via FakePool).""" + from __future__ import annotations import time from unittest.mock import patch -import pytest - from paperscout.models import Paper from paperscout.storage import ( PaperCache, @@ -13,11 +12,10 @@ UserWatchlist, iso_paper_number_from_discovered_url, ) -from tests.conftest import FakePool - # ── PaperCache ──────────────────────────────────────────────────────────────── + class TestPaperCache: def test_is_fresh_when_empty(self, fake_pool): cache = PaperCache(fake_pool, ttl_hours=1.0) @@ -77,6 +75,7 @@ def test_write_non_ascii(self, fake_pool): # ── ProbeState ──────────────────────────────────────────────────────────────── + class TestProbeState: def test_initial_state(self, fake_pool): state = ProbeState(fake_pool) @@ -105,12 +104,14 @@ def test_mark_discovered_stores_last_modified(self, fake_pool): assert entry["discovered_at"] > 0 def test_iso_paper_number_from_discovered_url(self): - assert iso_paper_number_from_discovered_url( - "https://isocpp.org/files/papers/D4165R0.pdf" - ) == 4165 - assert iso_paper_number_from_discovered_url( - "https://isocpp.org/files/papers/P1234R0.html" - ) == 1234 + assert ( + iso_paper_number_from_discovered_url("https://isocpp.org/files/papers/D4165R0.pdf") + == 4165 + ) + assert ( + 
iso_paper_number_from_discovered_url("https://isocpp.org/files/papers/P1234R0.html") + == 1234 + ) assert iso_paper_number_from_discovered_url("https://example.com/") is None def test_paper_nums_from_discovered_iso_urls(self, fake_pool): @@ -221,6 +222,7 @@ def test_miss_counts_property_returns_all(self, fake_pool): # ── UserWatchlist ───────────────────────────────────────────────────────────── + class TestUserWatchlist: def test_add_author_returns_true(self, fake_pool): wl = UserWatchlist(fake_pool) @@ -297,7 +299,6 @@ def test_get_all_watched_paper_nums_union(self, fake_pool): assert nums == {2300, 2301} def test_matches_for_users_author_match(self, fake_pool): - from paperscout.monitor import PerUserMatches wl = UserWatchlist(fake_pool) wl.add("U1", "niebler") paper = Paper(id="P2300R11", title="X", author="Eric Niebler") @@ -307,7 +308,6 @@ def test_matches_for_users_author_match(self, fake_pool): assert paper in matched_papers def test_matches_for_users_paper_match(self, fake_pool): - from paperscout.monitor import PerUserMatches wl = UserWatchlist(fake_pool) wl.add("U1", "2300") paper = Paper(id="P2300R11", title="X", author="Unknown") @@ -328,12 +328,18 @@ def test_matches_for_users_empty_watchlist(self, fake_pool): def test_matches_for_users_probe_hit_author(self, fake_pool): from paperscout.sources import ProbeHit + wl = UserWatchlist(fake_pool) wl.add("U1", "niebler") hit = ProbeHit( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="frontier", front_text="written by niebler", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="frontier", + front_text="written by niebler", + is_recent=True, ) result = wl.matches_for_users([], [hit]) assert "U1" in result @@ -341,12 +347,17 @@ def test_matches_for_users_probe_hit_author(self, fake_pool): def test_matches_for_users_probe_hit_paper_number(self, fake_pool): from paperscout.sources import ProbeHit + wl = 
UserWatchlist(fake_pool) wl.add("U1", "9999") hit = ProbeHit( url="https://isocpp.org/files/papers/D9999R0.pdf", - prefix="D", number=9999, revision=0, extension=".pdf", - tier="watchlist", is_recent=True, + prefix="D", + number=9999, + revision=0, + extension=".pdf", + tier="watchlist", + is_recent=True, ) result = wl.matches_for_users([], [hit]) assert "U1" in result