diff --git a/changelog.d/release-manifest-consumer.changed.md b/changelog.d/release-manifest-consumer.changed.md new file mode 100644 index 00000000..f958a561 --- /dev/null +++ b/changelog.d/release-manifest-consumer.changed.md @@ -0,0 +1 @@ +Align the bundled UK release manifest with the pinned `policyengine-uk` package version and updated data package revisions. diff --git a/pyproject.toml b/pyproject.toml index 51729c6a..87b0eaa8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,11 +28,11 @@ dependencies = [ [project.optional-dependencies] uk = [ "policyengine_core>=3.23.6", - "policyengine-uk>=2.51.0", + "policyengine-uk==2.78.0", ] us = [ "policyengine_core>=3.23.6", - "policyengine-us>=1.213.1", + "policyengine-us==1.602.0", ] dev = [ "pytest", @@ -45,8 +45,8 @@ dev = [ "pytest-asyncio>=0.26.0", "ruff>=0.9.0", "policyengine_core>=3.23.6", - "policyengine-uk>=2.51.0", - "policyengine-us>=1.213.1", + "policyengine-uk==2.78.0", + "policyengine-us==1.602.0", "towncrier>=24.8.0", "mypy>=1.11.0", "pytest-cov>=5.0.0", diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py index f0b8536c..710024b5 100644 --- a/src/policyengine/core/__init__.py +++ b/src/policyengine/core/__init__.py @@ -12,6 +12,13 @@ from .region import Region as Region from .region import RegionRegistry as RegionRegistry from .region import RegionType as RegionType +from .release_manifest import CountryReleaseManifest as CountryReleaseManifest +from .release_manifest import DataPackageVersion as DataPackageVersion +from .release_manifest import DataReleaseArtifact as DataReleaseArtifact +from .release_manifest import DataReleaseManifest as DataReleaseManifest +from .release_manifest import PackageVersion as PackageVersion +from .release_manifest import get_data_release_manifest as get_data_release_manifest +from .release_manifest import get_release_manifest as get_release_manifest from .scoping_strategy import RegionScopingStrategy as RegionScopingStrategy from 
"""Release manifests pinning model packages to data package revisions.

A *country release manifest* is bundled with this package (one JSON file per
country) and names the model package version, the data package version and
the Hugging Face repository the data lives in.  A *data release manifest* is
published inside that data repository and lists the individual artifacts of
one data release.
"""

import os
from functools import lru_cache
from importlib.resources import files
from pathlib import Path

import requests
from pydantic import BaseModel, Field

# Upper bound, in seconds, for the HTTP request that fetches a data release
# manifest from Hugging Face.
HF_REQUEST_TIMEOUT_SECONDS = 30

# Path prefix Hugging Face puts in front of the repo id for each repo type.
# Model repos have no prefix, which is also the fallback for unknown types.
_HF_REPO_TYPE_URL_PREFIXES = {
    "model": "",
    "dataset": "datasets/",
    "space": "spaces/",
}


class PackageVersion(BaseModel):
    """A package name together with one exact version string."""

    name: str
    version: str


class DataPackageVersion(PackageVersion):
    """A data package release and the Hugging Face repo that hosts it."""

    repo_id: str
    repo_type: str = "model"
    release_manifest_path: str = "release_manifest.json"


class CompatibleModelPackage(BaseModel):
    """A model package name with the version specifier a data release supports."""

    name: str
    specifier: str


class ArtifactPathReference(BaseModel):
    """A fixed path of a dataset file inside the data repository."""

    path: str


class ArtifactPathTemplate(BaseModel):
    """A dataset path containing ``str.format`` placeholders."""

    path_template: str

    def resolve(self, **kwargs: str) -> str:
        """Fill the template's placeholders with ``kwargs``.

        Raises:
            KeyError: If the template names a placeholder missing from
                ``kwargs`` (raised by ``str.format``).
        """
        return self.path_template.format(**kwargs)


class DataReleaseArtifact(BaseModel):
    """One file of a data release, pinned to a repository revision."""

    kind: str
    path: str
    repo_id: str
    revision: str
    sha256: str | None = None
    size_bytes: int | None = None

    @property
    def uri(self) -> str:
        """The ``hf://`` URI of this artifact at its pinned revision."""
        return build_hf_uri(
            repo_id=self.repo_id,
            path_in_repo=self.path,
            revision=self.revision,
        )


class DataReleaseManifest(BaseModel):
    """The manifest published alongside one data package release."""

    schema_version: int
    data_package: PackageVersion
    compatible_model_packages: list[CompatibleModelPackage] = Field(
        default_factory=list
    )
    default_datasets: dict[str, str] = Field(default_factory=dict)
    artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict)


class CountryReleaseManifest(BaseModel):
    """The bundled manifest tying a country's model and data packages together."""

    country_id: str
    policyengine_version: str
    model_package: PackageVersion
    data_package: DataPackageVersion
    default_dataset: str
    datasets: dict[str, ArtifactPathReference] = Field(default_factory=dict)
    region_datasets: dict[str, ArtifactPathTemplate] = Field(default_factory=dict)

    @property
    def default_dataset_uri(self) -> str:
        """The resolved URI of ``default_dataset`` for this country."""
        return resolve_dataset_reference(self.country_id, self.default_dataset)


def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str:
    """Return ``hf://<repo_id>/<path_in_repo>@<revision>``."""
    return f"hf://{repo_id}/{path_in_repo}@{revision}"


@lru_cache
def get_release_manifest(country_id: str) -> CountryReleaseManifest:
    """Load and validate the release manifest bundled for ``country_id``.

    The result is cached per country, so every caller shares one instance.

    Raises:
        ValueError: If no manifest file is bundled for ``country_id``.
    """
    manifest_path = files("policyengine").joinpath(
        "data", "release_manifests", f"{country_id}.json"
    )
    if not manifest_path.is_file():
        raise ValueError(f"No bundled release manifest for country '{country_id}'")

    # JSON is UTF-8 by specification; do not depend on the locale's default.
    return CountryReleaseManifest.model_validate_json(
        manifest_path.read_text(encoding="utf-8")
    )


def _data_release_manifest_url(data_package: DataPackageVersion) -> str:
    """Build the Hugging Face ``resolve`` URL of a data release manifest.

    The package version is used as the repository revision.  Dataset and
    space repositories live under a ``datasets/`` / ``spaces/`` prefix; model
    repositories (the default ``repo_type``) have none.
    """
    prefix = _HF_REPO_TYPE_URL_PREFIXES.get(data_package.repo_type, "")
    return (
        "https://huggingface.co/"
        f"{prefix}{data_package.repo_id}/resolve/{data_package.version}/"
        f"{data_package.release_manifest_path}"
    )


@lru_cache
def get_data_release_manifest(country_id: str) -> DataReleaseManifest:
    """Fetch the data release manifest for ``country_id`` from Hugging Face.

    A bearer token is sent when the ``HUGGING_FACE_TOKEN`` environment
    variable is set.  Successful results are cached per country.

    Raises:
        ValueError: If Hugging Face answers 401 or 403, or if no manifest is
            bundled for ``country_id``.
        requests.HTTPError: For any other unsuccessful HTTP status.
    """
    country_manifest = get_release_manifest(country_id)
    data_package = country_manifest.data_package

    headers: dict[str, str] = {}
    token = os.environ.get("HUGGING_FACE_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    response = requests.get(
        _data_release_manifest_url(data_package),
        headers=headers,
        timeout=HF_REQUEST_TIMEOUT_SECONDS,
    )
    # Authentication failures get an actionable message instead of a bare
    # HTTP error, because a missing token is the likely cause.
    if response.status_code in (401, 403):
        raise ValueError(
            "Could not fetch the data release manifest from Hugging Face. "
            "If this country uses a private data repo, set HUGGING_FACE_TOKEN."
        )
    response.raise_for_status()
    return DataReleaseManifest.model_validate_json(response.text)
+ ) + response.raise_for_status() + return DataReleaseManifest.model_validate_json(response.text) + + +def resolve_dataset_reference(country_id: str, dataset: str) -> str: + if "://" in dataset: + return dataset + + manifest = get_release_manifest(country_id) + path_reference = manifest.datasets.get(dataset) + if path_reference is not None: + return build_hf_uri( + repo_id=manifest.data_package.repo_id, + path_in_repo=path_reference.path, + revision=manifest.data_package.version, + ) + + data_release_manifest = get_data_release_manifest(country_id) + artifact = data_release_manifest.artifacts.get(dataset) + if artifact is None: + raise ValueError( + f"Unknown dataset '{dataset}' for country '{country_id}'. " + f"Known datasets: {sorted(manifest.datasets)}" + ) + + return artifact.uri + + +def dataset_logical_name(dataset: str) -> str: + return Path(dataset.rsplit("@", 1)[0]).stem + + +def resolve_default_datasets(country_id: str) -> list[str]: + manifest = get_release_manifest(country_id) + return list(manifest.datasets.keys()) + + +def resolve_region_dataset_path( + country_id: str, + region_type: str, + **kwargs: str, +) -> str | None: + manifest = get_release_manifest(country_id) + template = manifest.region_datasets.get(region_type) + if template is None: + return None + + resolved_path = template.resolve(**kwargs) + if "://" in resolved_path: + return resolved_path + + return build_hf_uri( + repo_id=manifest.data_package.repo_id, + path_in_repo=resolved_path, + revision=manifest.data_package.version, + ) diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index ce54a6c7..b9af105d 100644 --- a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -94,3 +94,17 @@ def save(self): def load(self): """Load the simulation's output dataset.""" self.tax_benefit_model_version.load(self) + + @property + def release_bundle(self) -> dict[str, str | None]: + bundle = ( + 
self.tax_benefit_model_version.release_bundle + if self.tax_benefit_model_version is not None + else {} + ) + return { + **bundle, + "dataset_filepath": self.dataset.filepath + if self.dataset is not None + else None, + } diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index a926e203..7b09dfcc 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, Field +from .release_manifest import CountryReleaseManifest, PackageVersion from .tax_benefit_model import TaxBenefitModel if TYPE_CHECKING: @@ -32,6 +33,13 @@ class TaxBenefitModelVersion(BaseModel): region_registry: "RegionRegistry | None" = Field( default=None, description="Registry of supported geographic regions" ) + release_manifest: CountryReleaseManifest | None = Field( + default=None, + exclude=True, + ) + model_package: PackageVersion | None = Field(default=None) + data_package: PackageVersion | None = Field(default=None) + default_dataset_uri: str | None = Field(default=None) @property def parameter_values(self) -> list["ParameterValue"]: @@ -116,6 +124,28 @@ def get_region(self, code: str) -> "Region | None": return None return self.region_registry.get(code) + @property + def release_bundle(self) -> dict[str, str | None]: + return { + "country_id": self.release_manifest.country_id + if self.release_manifest is not None + else None, + "policyengine_version": self.release_manifest.policyengine_version + if self.release_manifest is not None + else None, + "model_package": self.model_package.name + if self.model_package is not None + else None, + "model_version": self.version, + "data_package": self.data_package.name + if self.data_package is not None + else None, + "data_version": self.data_package.version + if self.data_package is not None + else None, + "default_dataset_uri": self.default_dataset_uri, + } + def __repr__(self) -> 
str: # Give the id and version, and the number of variables, parameters, parameter nodes, parameter values return f"" diff --git a/src/policyengine/countries/uk/regions.py b/src/policyengine/countries/uk/regions.py index 671990b6..2f100524 100644 --- a/src/policyengine/countries/uk/regions.py +++ b/src/policyengine/countries/uk/regions.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING from policyengine.core.region import Region, RegionRegistry +from policyengine.core.release_manifest import resolve_region_dataset_path from policyengine.core.scoping_strategy import ( RowFilterStrategy, WeightReplacementStrategy, @@ -127,7 +128,7 @@ def build_uk_region_registry( code="uk", label="United Kingdom", region_type="national", - dataset_path=f"{UK_DATA_BUCKET}/enhanced_frs_2023_24.h5", + dataset_path=resolve_region_dataset_path("uk", "national"), ) ) diff --git a/src/policyengine/countries/us/regions.py b/src/policyengine/countries/us/regions.py index 93a8cff3..f335805f 100644 --- a/src/policyengine/countries/us/regions.py +++ b/src/policyengine/countries/us/regions.py @@ -8,6 +8,7 @@ """ from policyengine.core.region import Region, RegionRegistry +from policyengine.core.release_manifest import resolve_region_dataset_path from policyengine.core.scoping_strategy import RowFilterStrategy from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATES @@ -40,7 +41,7 @@ def build_us_region_registry() -> RegionRegistry: code="us", label="United States", region_type="national", - dataset_path=f"{US_DATA_BUCKET}/enhanced_cps_2024.h5", + dataset_path=resolve_region_dataset_path("us", "national"), ) ) @@ -52,7 +53,11 @@ def build_us_region_registry() -> RegionRegistry: label=name, region_type="state", parent_code="us", - dataset_path=f"{US_DATA_BUCKET}/states/{abbrev}.h5", + dataset_path=resolve_region_dataset_path( + "us", + "state", + state_code=abbrev, + ), state_code=abbrev, state_name=name, ) @@ -76,7 +81,11 @@ def build_us_region_registry() -> RegionRegistry: 
label=label, region_type="congressional_district", parent_code=f"state/{state_abbrev.lower()}", - dataset_path=f"{US_DATA_BUCKET}/districts/{district_code}.h5", + dataset_path=resolve_region_dataset_path( + "us", + "congressional_district", + district_code=district_code, + ), state_code=state_abbrev, state_name=state_name, ) diff --git a/src/policyengine/data/release_manifests/uk.json b/src/policyengine/data/release_manifests/uk.json new file mode 100644 index 00000000..ac1d93dd --- /dev/null +++ b/src/policyengine/data/release_manifests/uk.json @@ -0,0 +1,27 @@ +{ + "country_id": "uk", + "policyengine_version": "3.4.1", + "model_package": { + "name": "policyengine-uk", + "version": "2.78.0" + }, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.3", + "repo_id": "policyengine/policyengine-uk-data-private" + }, + "default_dataset": "enhanced_frs_2023_24", + "datasets": { + "frs_2023_24": { + "path": "frs_2023_24.h5" + }, + "enhanced_frs_2023_24": { + "path": "enhanced_frs_2023_24.h5" + } + }, + "region_datasets": { + "national": { + "path_template": "enhanced_frs_2023_24.h5" + } + } +} diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json new file mode 100644 index 00000000..0ea73808 --- /dev/null +++ b/src/policyengine/data/release_manifests/us.json @@ -0,0 +1,30 @@ +{ + "country_id": "us", + "policyengine_version": "3.4.1", + "model_package": { + "name": "policyengine-us", + "version": "1.602.0" + }, + "data_package": { + "name": "policyengine-us-data", + "version": "1.77.0", + "repo_id": "policyengine/policyengine-us-data" + }, + "default_dataset": "enhanced_cps_2024", + "datasets": { + "enhanced_cps_2024": { + "path": "enhanced_cps_2024.h5" + } + }, + "region_datasets": { + "national": { + "path_template": "enhanced_cps_2024.h5" + }, + "state": { + "path_template": "states/{state_code}.h5" + }, + "congressional_district": { + "path_template": "districts/{district_code}.h5" + } + } +} 
diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py index 442e45a5..ec0f579b 100644 --- a/src/policyengine/tax_benefit_models/uk/datasets.py +++ b/src/policyengine/tax_benefit_models/uk/datasets.py @@ -5,6 +5,10 @@ from pydantic import ConfigDict from policyengine.core import Dataset, YearData +from policyengine.core.release_manifest import ( + dataset_logical_name, + resolve_dataset_reference, +) class UKYearData(YearData): @@ -96,17 +100,19 @@ def __repr__(self) -> str: def create_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-uk-data/frs_2023_24.h5", - "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5", + "frs_2023_24", + "enhanced_frs_2023_24", ], years: list[int] = [2026, 2027, 2028, 2029, 2030], data_folder: str = "./data", ) -> dict[str, PolicyEngineUKDataset]: result = {} for dataset in datasets: + resolved_dataset = resolve_dataset_reference("uk", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) from policyengine_uk import Microsimulation - sim = Microsimulation(dataset=dataset) + sim = Microsimulation(dataset=resolved_dataset) for year in years: year_dataset = sim.dataset[year] @@ -154,10 +160,10 @@ def create_datasets( ) uk_dataset = PolicyEngineUKDataset( - id=f"{Path(dataset).stem}_year_{year}", - name=f"{Path(dataset).stem}-year-{year}", - description=f"UK Dataset for year {year} based on {Path(dataset).stem}", - filepath=f"{data_folder}/{Path(dataset).stem}_year_{year}.h5", + id=f"{dataset_stem}_year_{year}", + name=f"{dataset_stem}-year-{year}", + description=f"UK Dataset for year {year} based on {dataset_stem}", + filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5", year=int(year), data=UKYearData( person=MicroDataFrame(person_df, weights="person_weight"), @@ -167,7 +173,7 @@ def create_datasets( ) uk_dataset.save() - dataset_key = f"{Path(dataset).stem}_{year}" + dataset_key = f"{dataset_stem}_{year}" result[dataset_key] = 
uk_dataset return result @@ -175,25 +181,27 @@ def create_datasets( def load_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-uk-data/frs_2023_24.h5", - "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5", + "frs_2023_24", + "enhanced_frs_2023_24", ], years: list[int] = [2026, 2027, 2028, 2029, 2030], data_folder: str = "./data", ) -> dict[str, PolicyEngineUKDataset]: result = {} for dataset in datasets: + resolved_dataset = resolve_dataset_reference("uk", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) for year in years: - filepath = f"{data_folder}/{Path(dataset).stem}_year_{year}.h5" + filepath = f"{data_folder}/{dataset_stem}_year_{year}.h5" uk_dataset = PolicyEngineUKDataset( - name=f"{Path(dataset).stem}-year-{year}", - description=f"UK Dataset for year {year} based on {Path(dataset).stem}", + name=f"{dataset_stem}-year-{year}", + description=f"UK Dataset for year {year} based on {dataset_stem}", filepath=filepath, year=int(year), ) uk_dataset.load() - dataset_key = f"{Path(dataset).stem}_{year}" + dataset_key = f"{dataset_stem}_{year}" result[dataset_key] = uk_dataset return result @@ -201,8 +209,8 @@ def load_datasets( def ensure_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-uk-data/frs_2023_24.h5", - "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5", + "frs_2023_24", + "enhanced_frs_2023_24", ], years: list[int] = [2026, 2027, 2028, 2029, 2030], data_folder: str = "./data", @@ -220,8 +228,10 @@ def ensure_datasets( # Check if all dataset files exist all_exist = True for dataset in datasets: + resolved_dataset = resolve_dataset_reference("uk", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) for year in years: - filepath = Path(f"{data_folder}/{Path(dataset).stem}_year_{year}.h5") + filepath = Path(f"{data_folder}/{dataset_stem}_year_{year}.h5") if not filepath.exists(): all_exist = False break diff --git a/src/policyengine/tax_benefit_models/uk/model.py 
b/src/policyengine/tax_benefit_models/uk/model.py index 8575a795..7b605157 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -15,6 +15,7 @@ TaxBenefitModelVersion, Variable, ) +from policyengine.core.release_manifest import get_release_manifest from policyengine.utils.entity_utils import ( build_entity_relationships, filter_dataset_by_household_variable, @@ -143,14 +144,25 @@ class PolicyEngineUKLatest(TaxBenefitModelVersion): } def __init__(self, **kwargs: dict): - # Lazy-load package metadata if not provided + manifest = get_release_manifest("uk") if "version" not in kwargs or kwargs.get("version") is None: pkg_version, upload_time = _get_uk_package_metadata() kwargs["version"] = pkg_version if upload_time is not None: kwargs["created_at"] = datetime.datetime.fromisoformat(upload_time) + if kwargs["version"] != manifest.model_package.version: + raise RuntimeError( + "Installed policyengine-uk version does not match the bundled " + f"policyengine.py release manifest: {kwargs['version']} != " + f"{manifest.model_package.version}." 
+ ) + super().__init__(**kwargs) + self.release_manifest = manifest + self.model_package = manifest.model_package + self.data_package = manifest.data_package + self.default_dataset_uri = manifest.default_dataset_uri from policyengine_core.enums import Enum from policyengine_uk.system import system diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index 1bbf78f3..7ea12f8e 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -6,6 +6,10 @@ from pydantic import ConfigDict from policyengine.core import Dataset, YearData +from policyengine.core.release_manifest import ( + dataset_logical_name, + resolve_dataset_reference, +) class USYearData(YearData): @@ -97,15 +101,15 @@ def __repr__(self) -> str: def create_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + "enhanced_cps_2024", ], years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", ) -> dict[str, PolicyEngineUSDataset]: - """Create PolicyEngineUSDataset instances from HuggingFace dataset paths. + """Create PolicyEngineUSDataset instances from logical dataset names or URLs. 
Args: - datasets: List of HuggingFace dataset paths (e.g., "hf://policyengine/policyengine-us-data/cps_2024.h5") + datasets: List of logical dataset names or HuggingFace dataset URLs years: List of years to extract data for data_folder: Directory to save the dataset files @@ -116,7 +120,9 @@ def create_datasets( result = {} for dataset in datasets: - sim = Microsimulation(dataset=dataset) + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) + sim = Microsimulation(dataset=resolved_dataset) for year in years: # Get all input variables from the simulation @@ -255,10 +261,10 @@ def create_datasets( tax_unit_df = entity_df us_dataset = PolicyEngineUSDataset( - id=f"{Path(dataset).stem}_year_{year}", - name=f"{Path(dataset).stem}-year-{year}", - description=f"US Dataset for year {year} based on {Path(dataset).stem}", - filepath=f"{data_folder}/{Path(dataset).stem}_year_{year}.h5", + id=f"{dataset_stem}_year_{year}", + name=f"{dataset_stem}-year-{year}", + description=f"US Dataset for year {year} based on {dataset_stem}", + filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5", year=int(year), data=USYearData( person=MicroDataFrame(person_df, weights="person_weight"), @@ -273,7 +279,7 @@ def create_datasets( ) us_dataset.save() - dataset_key = f"{Path(dataset).stem}_{year}" + dataset_key = f"{dataset_stem}_{year}" result[dataset_key] = us_dataset return result @@ -281,7 +287,7 @@ def create_datasets( def load_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + "enhanced_cps_2024", ], years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", @@ -298,17 +304,19 @@ def load_datasets( """ result = {} for dataset in datasets: + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) for year in years: - filepath = f"{data_folder}/{Path(dataset).stem}_year_{year}.h5" + filepath = 
f"{data_folder}/{dataset_stem}_year_{year}.h5" us_dataset = PolicyEngineUSDataset( - name=f"{Path(dataset).stem}-year-{year}", - description=f"US Dataset for year {year} based on {Path(dataset).stem}", + name=f"{dataset_stem}-year-{year}", + description=f"US Dataset for year {year} based on {dataset_stem}", filepath=filepath, year=year, ) us_dataset.load() - dataset_key = f"{Path(dataset).stem}_{year}" + dataset_key = f"{dataset_stem}_{year}" result[dataset_key] = us_dataset return result @@ -316,7 +324,7 @@ def load_datasets( def ensure_datasets( datasets: list[str] = [ - "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + "enhanced_cps_2024", ], years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", @@ -334,8 +342,10 @@ def ensure_datasets( # Check if all dataset files exist all_exist = True for dataset in datasets: + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) for year in years: - filepath = Path(f"{data_folder}/{Path(dataset).stem}_year_{year}.h5") + filepath = Path(f"{data_folder}/{dataset_stem}_year_{year}.h5") if not filepath.exists(): all_exist = False break diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 6ebedde2..c7d47a75 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -14,6 +14,7 @@ TaxBenefitModelVersion, Variable, ) +from policyengine.core.release_manifest import get_release_manifest from policyengine.utils.entity_utils import ( build_entity_relationships, filter_dataset_by_household_variable, @@ -48,7 +49,6 @@ class PolicyEngineUS(TaxBenefitModel): def _get_us_package_metadata(): """Get PolicyEngine US package version and upload time (lazy-loaded).""" pkg_version = version("policyengine-us") - # Get published time from PyPI response = requests.get("https://pypi.org/pypi/policyengine-us/json") data = 
response.json() upload_time = data["releases"][pkg_version][0]["upload_time_iso_8601"] @@ -125,13 +125,24 @@ class PolicyEngineUSLatest(TaxBenefitModelVersion): } def __init__(self, **kwargs: dict): - # Lazy-load package metadata if not provided + manifest = get_release_manifest("us") if "version" not in kwargs or kwargs.get("version") is None: pkg_version, upload_time = _get_us_package_metadata() kwargs["version"] = pkg_version kwargs["created_at"] = datetime.datetime.fromisoformat(upload_time) + if kwargs["version"] != manifest.model_package.version: + raise RuntimeError( + "Installed policyengine-us version does not match the bundled " + f"policyengine.py release manifest: {kwargs['version']} != " + f"{manifest.model_package.version}." + ) + super().__init__(**kwargs) + self.release_manifest = manifest + self.model_package = manifest.model_package + self.data_package = manifest.data_package + self.default_dataset_uri = manifest.default_dataset_uri from policyengine_core.enums import Enum from policyengine_us.system import system diff --git a/tests/test_models.py b/tests/test_models.py index 960cb389..0f0767bc 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -24,6 +24,19 @@ def test_can_get_region_by_code(self): assert england is not None assert england.label == "England" + def test_has_release_manifest_metadata(self): + """UK model should expose its bundled release manifest metadata.""" + assert uk_latest.release_manifest is not None + assert uk_latest.release_manifest.country_id == "uk" + assert uk_latest.model_package.name == "policyengine-uk" + assert uk_latest.model_package.version == "2.78.0" + assert uk_latest.data_package.name == "policyengine-uk-data" + assert uk_latest.data_package.version == "1.40.3" + assert ( + uk_latest.default_dataset_uri + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3" + ) + def test_has_hundreds_of_parameters(self): """UK model should have hundreds of parameters.""" assert 
"""Tests for bundled compatibility manifests and data release manifests."""

import json
from unittest.mock import MagicMock, patch

from policyengine.core.release_manifest import (
    dataset_logical_name,
    get_data_release_manifest,
    get_release_manifest,
    resolve_dataset_reference,
)


def _response_with_json(payload: dict) -> MagicMock:
    """Build a stand-in for a successful ``requests`` response carrying ``payload``."""
    response = MagicMock()
    response.status_code = 200
    response.text = json.dumps(payload)
    response.raise_for_status.return_value = None
    return response


def _clear_manifest_caches() -> None:
    """Drop cached manifests so no test sees another test's results."""
    get_release_manifest.cache_clear()
    get_data_release_manifest.cache_clear()


class TestReleaseManifests:
    """Tests for bundled country manifests."""

    def setup_method(self):
        # Other test modules may have warmed the caches before this class runs.
        _clear_manifest_caches()

    def teardown_method(self):
        _clear_manifest_caches()

    def test__given_us_manifest__then_has_pinned_model_and_data_packages(self):
        manifest = get_release_manifest("us")

        assert manifest.country_id == "us"
        assert manifest.policyengine_version == "3.4.1"
        assert manifest.model_package.name == "policyengine-us"
        assert manifest.model_package.version == "1.602.0"
        assert manifest.data_package.name == "policyengine-us-data"
        assert manifest.data_package.version == "1.77.0"
        assert manifest.data_package.repo_id == "policyengine/policyengine-us-data"

    def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self):
        manifest = get_release_manifest("uk")

        assert manifest.country_id == "uk"
        assert manifest.policyengine_version == "3.4.1"
        assert manifest.model_package.name == "policyengine-uk"
        assert manifest.model_package.version == "2.78.0"
        assert manifest.data_package.name == "policyengine-uk-data"
        assert manifest.data_package.version == "1.40.3"
        assert (
            manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private"
        )

    def test__given_us_dataset_name__then_resolves_to_versioned_hf_url(self):
        resolved = resolve_dataset_reference("us", "enhanced_cps_2024")

        assert (
            resolved
            == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0"
        )

    def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self):
        resolved = resolve_dataset_reference("uk", "enhanced_frs_2023_24")

        assert (
            resolved
            == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.3"
        )

    def test__given_explicit_url__then_resolution_is_noop(self):
        url = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.77.0"

        assert resolve_dataset_reference("us", url) == url

    def test__given_versioned_dataset_url__then_logical_name_drops_version(self):
        dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0"

        assert dataset_logical_name(dataset) == "enhanced_cps_2024"

    def test__given_country__then_can_fetch_data_release_manifest(self):
        payload = {
            "schema_version": 1,
            "data_package": {
                "name": "policyengine-us-data",
                "version": "1.77.0",
            },
            "compatible_model_packages": [],
            "default_datasets": {"national": "enhanced_cps_2024"},
            "artifacts": {
                "enhanced_cps_2024": {
                    "kind": "microdata",
                    "path": "enhanced_cps_2024.h5",
                    "repo_id": "policyengine/policyengine-us-data",
                    "revision": "1.77.0",
                    "sha256": "abc",
                    "size_bytes": 123,
                }
            },
        }

        with patch(
            "policyengine.core.release_manifest.requests.get",
            return_value=_response_with_json(payload),
        ) as mock_get:
            manifest = get_data_release_manifest("us")

        assert manifest.schema_version == 1
        assert manifest.data_package.name == "policyengine-us-data"
        assert manifest.default_datasets["national"] == "enhanced_cps_2024"
        assert (
            manifest.artifacts["enhanced_cps_2024"].uri
            == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0"
        )
        assert mock_get.call_count == 1
        # Pin the requested location: the manifest must come from the data
        # repo at the revision named by the bundled country manifest.
        assert mock_get.call_args.args[0] == (
            "https://huggingface.co/policyengine/policyengine-us-data/"
            "resolve/1.77.0/release_manifest.json"
        )
        assert mock_get.call_args.kwargs["timeout"] is not None
def test__given_us_registry__then_has_national_region(self): assert national.code == "us" assert national.label == "United States" assert national.region_type == "national" - assert national.dataset_path == f"{US_DATA_BUCKET}/enhanced_cps_2024.h5" + assert ( + national.dataset_path + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.77.0" + ) def test__given_us_registry__then_has_51_states(self): """Given: US region registry @@ -130,7 +132,10 @@ def test__given_california_region__then_has_correct_format(self): assert ca.label == "California" assert ca.region_type == "state" assert ca.parent_code == "us" - assert ca.dataset_path == f"{US_DATA_BUCKET}/states/CA.h5" + assert ( + ca.dataset_path + == "hf://policyengine/policyengine-us-data/states/CA.h5@1.77.0" + ) assert ca.state_code == "CA" assert ca.state_name == "California" assert not ca.requires_filter @@ -160,7 +165,10 @@ def test__given_ca_first_district__then_has_correct_format(self): assert "1st" in ca01.label.lower() or "1 " in ca01.label assert ca01.region_type == "congressional_district" assert ca01.parent_code == "state/ca" - assert ca01.dataset_path == f"{US_DATA_BUCKET}/districts/CA-01.h5" + assert ( + ca01.dataset_path + == "hf://policyengine/policyengine-us-data/districts/CA-01.h5@1.77.0" + ) assert ca01.state_code == "CA" assert not ca01.requires_filter