From b96e9d0a1214c9f2f3e3631ca33a0c825fa5171c Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 1 Jun 2026 08:20:35 -0400 Subject: [PATCH] Document UK Hugging Face data access --- README.md | 7 +++++- docs/microsim.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d2c144b3..70496d57 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ from policyengine.core import Simulation from policyengine.outputs.aggregate import Aggregate, AggregateType datasets = pe.uk.ensure_datasets( - datasets=["hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5"], + datasets=["enhanced_frs_2023_24"], years=[2026], data_folder="./data", ) @@ -58,6 +58,11 @@ print(f"Total UC spending: £{agg.result / 1e9:.1f}bn") For baseline-vs-reform comparisons, see `pe.uk.economic_impact_analysis` and its US counterpart. +UK population data is stored in a private Hugging Face model repository. Set +`HUGGING_FACE_TOKEN` to a token from an account with access before running UK +population examples. To download the raw `.h5` file directly, see +[Microsimulation](docs/microsim.md#uk-private-data-and-raw-h5-access). + ## Documentation **Core concepts:** diff --git a/docs/microsim.md b/docs/microsim.md index 9a80059e..d531db6d 100644 --- a/docs/microsim.md +++ b/docs/microsim.md @@ -53,6 +53,62 @@ List datasets already known to the country: pe.us.load_datasets() # or pe.uk.load_datasets() ``` +### UK private data and raw h5 access + +UK population data uses licensed Family Resources Survey inputs. The default +UK release bundle points to the private +`policyengine/policyengine-uk-data-private` Hugging Face model repository. Set +`HUGGING_FACE_TOKEN` to a token from a Hugging Face account with access: + +```bash +export HUGGING_FACE_TOKEN=hf_... +``` + +For `policyengine.py` analyses, use the logical dataset name from the release +bundle. 
`ensure_datasets` resolves it to the pinned private Hugging Face file, +downloads it, caches it locally, and creates year-specific uprated datasets: + +```python +import policyengine as pe +from policyengine.core import Simulation + +datasets = pe.uk.ensure_datasets( + datasets=["enhanced_frs_2023_24"], + years=[2026], + data_folder="./data", +) +dataset = datasets["enhanced_frs_2023_24_2026"] + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.uk.model, +) +simulation.run() +``` + +To download the raw h5 artifact directly from Hugging Face, use +`huggingface_hub` and specify `repo_type="model"`: + +```python +import os +from huggingface_hub import hf_hub_download + +path = hf_hub_download( + repo_id="policyengine/policyengine-uk-data-private", + filename="enhanced_frs_2023_24.h5", + repo_type="model", + token=os.environ["HUGGING_FACE_TOKEN"], +) + +print(path) +``` + +The repository URL is +[huggingface.co/policyengine/policyengine-uk-data-private](https://huggingface.co/policyengine/policyengine-uk-data-private). A 404 from +the website or `RepositoryNotFoundError` from the Hub API usually means the +browser or token is not authenticated as an account with access, or that the +Hub call omitted `repo_type="model"`. + ## Simulations A `Simulation` needs a dataset, a tax-benefit model version, and optionally a policy (reform):