From b14ad5137188b3b2c1733dd9117da127c2bbbfac Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 13:58:25 -0400 Subject: [PATCH] Remove manual support sign controls from donor imputation --- pyproject.toml | 2 +- src/microplex_us/pipelines/donor_imputers.py | 16 +-- src/microplex_us/pipelines/us.py | 12 +-- src/microplex_us/variables.py | 100 ++++++------------ .../test_scf_net_worth_components.py | 1 - .../test_regime_aware_donor_imputer.py | 24 +++-- tests/pipelines/test_us.py | 8 +- tests/pipelines/test_zi_qrf_backend.py | 8 +- tests/test_variables.py | 36 +++---- uv.lock | 10 +- 10 files changed, 78 insertions(+), 139 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6eff96e..6d0c869 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ r2 = [ "boto3>=1.34", ] policyengine = [ - "microimpute==1.15.1 ; python_full_version >= '3.12' and python_full_version < '3.15'", + "microimpute @ git+https://github.com/PolicyEngine/microimpute.git@27d23090dcf04c2c30ae369b0a209e27eb3659f0 ; python_full_version >= '3.12' and python_full_version < '3.15'", "policyengine-us==1.715.2; python_version >= '3.11' and python_version < '3.15'", "spm-calculator>=0.3.1", # Standalone tax-unit construction engine (the extraction of eCPS's diff --git a/src/microplex_us/pipelines/donor_imputers.py b/src/microplex_us/pipelines/donor_imputers.py index bc1bc3f..724ef20 100644 --- a/src/microplex_us/pipelines/donor_imputers.py +++ b/src/microplex_us/pipelines/donor_imputers.py @@ -168,19 +168,13 @@ def __init__( condition_vars: list[str], target_vars: list[str], n_estimators: int = 100, - nonnegative_vars: set[str] | None = None, classifier_type: str = "hist_gb", - min_class_count: int = 10, - min_class_fraction: float = 0.01, seed: int = 42, ) -> None: self.condition_vars = list(condition_vars) self.target_vars = list(target_vars) self.n_estimators = int(n_estimators) - self.nonnegative_vars = set(nonnegative_vars or ()) self.classifier_type = str(classifier_type) - self.min_class_count = int(min_class_count) - self.min_class_fraction = float(min_class_fraction) self.seed = int(seed) self._fitted: dict[str, Any] = {} self._regimes: dict[str, str] = {} @@ -204,8 +198,7 @@ def fit( ) if importlib.util.find_spec("quantile_forest") is None: raise ImportError( - "quantile-forest is required for the RegimeAwareDonorImputer " - "base QRF." + "quantile-forest is required for the RegimeAwareDonorImputer base QRF." ) from microimpute.models.qrf import QRF @@ -224,8 +217,6 @@ def fit( wrapper = ZeroInflatedImputer( base_imputer_class=QRF, base_imputer_kwargs={}, - min_class_count=self.min_class_count, - min_class_fraction=self.min_class_fraction, classifier_type=self.classifier_type, seed=self.seed, ) @@ -256,10 +247,7 @@ def generate( ) self._reset_prediction_rngs(fitted, seed=column_seed) preds = fitted.predict(synthetic[self.condition_vars]) - values = preds[column].to_numpy(dtype=float) - if column in self.nonnegative_vars: - values = np.maximum(values, 0.0) - synthetic[column] = values + synthetic[column] = preds[column].to_numpy(dtype=float) return synthetic def _reset_prediction_rngs( diff --git a/src/microplex_us/pipelines/us.py b/src/microplex_us/pipelines/us.py index 3c718a2..e9354da 100644 --- a/src/microplex_us/pipelines/us.py +++ b/src/microplex_us/pipelines/us.py @@ -5291,18 +5291,13 @@ def _build_donor_imputer( nonnegative_vars = { variable for variable, support_family in support_families.items() - if support_family - in { - VariableSupportFamily.ZERO_INFLATED_POSITIVE, - VariableSupportFamily.BOUNDED_SHARE, - } + if support_family is VariableSupportFamily.BOUNDED_SHARE } if backend == "regime_aware": return RegimeAwareDonorImputer( condition_vars=condition_vars, target_vars=list(target_vars), n_estimators=self.config.donor_imputer_qrf_n_estimators, - nonnegative_vars=nonnegative_vars, seed=self.config.random_seed, ) zero_inflated_vars = ( @@ -5311,8 +5306,7 @@ def _build_donor_imputer( for variable, support_family in support_families.items() if support_family in { - VariableSupportFamily.ZERO_INFLATED_POSITIVE, - VariableSupportFamily.ZERO_INFLATED_SIGNED, + VariableSupportFamily.SUPPORT_SENSITIVE, } } if backend == "zi_qrf" @@ -7147,7 +7141,7 @@ def _rank_match_donor_values( donor_weight_array = donor_weights.to_numpy(dtype=float) donor_weight_array = np.clip(donor_weight_array, a_min=0.0, a_max=None) - if strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE or ( + if ( strategy is DonorMatchStrategy.RANK and self._is_zero_inflated_positive_distribution(donor_array) ): diff --git a/src/microplex_us/variables.py b/src/microplex_us/variables.py index b3f35a2..3b41ce9 100644 --- a/src/microplex_us/variables.py +++ b/src/microplex_us/variables.py @@ -35,15 +35,13 @@ class DonorMatchStrategy(Enum): """How donor-generated scores should be mapped back onto donor support.""" RANK = "rank" - ZERO_INFLATED_POSITIVE = "zero_inflated_positive" class VariableSupportFamily(Enum): """Statistical support family for one variable.""" CONTINUOUS = "continuous" - ZERO_INFLATED_POSITIVE = "zero_inflated_positive" - ZERO_INFLATED_SIGNED = "zero_inflated_signed" + SUPPORT_SENSITIVE = "support_sensitive" BOUNDED_SHARE = "bounded_share" @@ -156,10 +154,7 @@ def is_redundant_given(self, variable_names: Iterable[str]) -> bool: @property def condition_score_mode(self) -> ConditionScoreMode: - if self.support_family in { - VariableSupportFamily.ZERO_INFLATED_POSITIVE, - VariableSupportFamily.ZERO_INFLATED_SIGNED, - }: + if self.support_family is VariableSupportFamily.SUPPORT_SENSITIVE: return ConditionScoreMode.VALUE_AND_SUPPORT return ConditionScoreMode.VALUE_ONLY @@ -258,12 +253,11 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, derived_from=( "qualified_dividend_income", "non_qualified_dividend_income", ), - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -278,12 +272,11 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, derived_from=( "qualified_dividend_income", "non_qualified_dividend_income", ), - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, notes="Ordinary dividend totals are derived from the qualified and non-qualified atomic basis.", ), "qualified_dividend_income": VariableSemanticSpec( @@ -293,8 +286,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -308,8 +300,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -323,8 +314,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -338,8 +328,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, ), @@ -350,8 +339,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=PUF_PENSION_CHALLENGER_SHARED_CONDITION_VARS, @@ -363,39 +351,33 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, ), "state_income_tax_paid": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "real_estate_tax_paid": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "mortgage_interest_paid": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "charitable_cash": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "charitable_noncash": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "student_loan_interest": VariableSemanticSpec( native_entity=EntityType.PERSON, @@ -404,15 +386,13 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, ), "ira_deduction": VariableSemanticSpec( native_entity=EntityType.TAX_UNIT, condition_entities=(EntityType.HOUSEHOLD, EntityType.TAX_UNIT), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), "health_savings_account_ald": VariableSemanticSpec( native_entity=EntityType.PERSON, @@ -421,8 +401,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, ), "self_employed_health_insurance_ald": VariableSemanticSpec( @@ -432,8 +411,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, ), "self_employed_pension_contribution_ald": VariableSemanticSpec( @@ -443,8 +421,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, ), "qualified_dividend_share": VariableSemanticSpec( @@ -459,8 +436,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_SIGNED, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -513,8 +489,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS, notes=( "Positive rental-income support should track geography and property-like " @@ -528,8 +503,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=RENTAL_INCOME_COMPONENT_PREFERRED_CONDITION_VARS, notes=( "Rental-loss support should track geography and property-like " @@ -543,8 +517,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_SIGNED, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=PUF_IRS_TAX_PREFERRED_CONDITION_VARS, supplemental_shared_condition_vars=PUF_IRS_TAX_SUPPLEMENTAL_SHARED_CONDITION_VARS, challenger_shared_condition_vars=( @@ -553,23 +526,19 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: ), "has_medicaid": VariableSemanticSpec( projection_aggregation=ProjectionAggregation.MAX, - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, notes="Binary proxy for Medicaid participation on the CPS scaffold.", ), "public_assistance": VariableSemanticSpec( - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, notes="Public assistance amounts are sparse and should preserve support.", ), "ssi": VariableSemanticSpec( - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, notes="SSI amounts are sparse and should preserve support.", ), "social_security": VariableSemanticSpec( - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, notes="Reported Social Security amounts are sparse and support-sensitive.", ), "snap": VariableSemanticSpec( @@ -579,8 +548,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: EntityType.HOUSEHOLD, EntityType.SPM_UNIT, ), - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, ), } @@ -624,8 +592,7 @@ def minor_positive_employment_income_mask(frame: pd.DataFrame) -> pd.Series: for _scf_component_leaf in SCF_NET_WORTH_COMPONENT_LEAVES: VARIABLE_SEMANTIC_SPECS[_scf_component_leaf] = VariableSemanticSpec( native_entity=EntityType.PERSON, - support_family=VariableSupportFamily.ZERO_INFLATED_POSITIVE, - donor_match_strategy=DonorMatchStrategy.ZERO_INFLATED_POSITIVE, + support_family=VariableSupportFamily.SUPPORT_SENSITIVE, preferred_condition_vars=SCF_COMPONENT_PREFERRED_CONDITION_VARS, notes="SCF balance-sheet component leaf; positive magnitude.", ) @@ -833,10 +800,7 @@ def restore_dividend_components_from_composition(frame: pd.DataFrame) -> pd.Data ), model_variables=DIVIDEND_COMPOSITION_MODEL_COLUMNS, restored_variables=DIVIDEND_COMPONENT_COLUMNS, - match_strategies={ - "dividend_income": DonorMatchStrategy.ZERO_INFLATED_POSITIVE, - DIVIDEND_SHARE_COLUMN: DonorMatchStrategy.RANK, - }, + match_strategies={DIVIDEND_SHARE_COLUMN: DonorMatchStrategy.RANK}, prepare_frame=add_dividend_composition_features, restore_frame=restore_dividend_components_from_composition, ) @@ -884,7 +848,7 @@ def score_donor_condition_var( if not include_support: continue - support = (aligned["target"] > 0).astype(float) + support = (aligned["target"].abs() > 0).astype(float) if 0.0 < float(support.mean()) < 1.0: support_correlation = aligned["condition"].corr( support, diff --git a/tests/data_sources/test_scf_net_worth_components.py b/tests/data_sources/test_scf_net_worth_components.py index bd9fdd9..6149df4 100644 --- a/tests/data_sources/test_scf_net_worth_components.py +++ b/tests/data_sources/test_scf_net_worth_components.py @@ -395,7 +395,6 @@ def test_regime_aware_fit(self) -> None: condition_vars=_DONOR_CONDITION_VARS, target_vars=list(_DONOR_TARGET_VARS), n_estimators=60, - nonnegative_vars=set(_DONOR_TARGET_VARS), seed=42, ) imputer.fit(train) diff --git a/tests/pipelines/test_regime_aware_donor_imputer.py b/tests/pipelines/test_regime_aware_donor_imputer.py index 0649287..7d4bbe9 100644 --- a/tests/pipelines/test_regime_aware_donor_imputer.py +++ b/tests/pipelines/test_regime_aware_donor_imputer.py @@ -39,9 +39,7 @@ pytest.importorskip("microimpute.models.zero_inflated") -def _three_sign_frame_with_gap( - n: int = 1500, seed: int = 0 -) -> pd.DataFrame: +def _three_sign_frame_with_gap(n: int = 1500, seed: int = 0) -> pd.DataFrame: """Fixture with a hard gap between positive and negative training values. Positives live in [100, ∞), negatives in (-∞, -100], zeros exactly @@ -201,9 +199,15 @@ def test_same_seed_repeats_identically(self) -> None: ) imputer.fit(train) - first = imputer.generate(conditions, seed=123)["short_term_capital_gains"].to_numpy() - second = imputer.generate(conditions, seed=123)["short_term_capital_gains"].to_numpy() - third = imputer.generate(conditions, seed=999)["short_term_capital_gains"].to_numpy() + first = imputer.generate(conditions, seed=123)[ + "short_term_capital_gains" + ].to_numpy() + second = imputer.generate(conditions, seed=123)[ + "short_term_capital_gains" + ].to_numpy() + third = imputer.generate(conditions, seed=999)[ + "short_term_capital_gains" + ].to_numpy() np.testing.assert_array_equal(first, second) assert not np.array_equal(first, third) @@ -226,5 +230,9 @@ def test_same_seed_repeats_identically_for_multiple_targets(self) -> None: third = imputer.generate(conditions, seed=654) for column in ("short_term_capital_gains", "rental_income"): - np.testing.assert_array_equal(first[column].to_numpy(), second[column].to_numpy()) - assert not np.array_equal(first[column].to_numpy(), third[column].to_numpy()) + np.testing.assert_array_equal( + first[column].to_numpy(), second[column].to_numpy() + ) + assert not np.array_equal( + first[column].to_numpy(), third[column].to_numpy() + ) diff --git a/tests/pipelines/test_us.py b/tests/pipelines/test_us.py index ef6411f..2ac3254 100644 --- a/tests/pipelines/test_us.py +++ b/tests/pipelines/test_us.py @@ -3378,7 +3378,7 @@ def generate(self, frame, seed=None): assert captured["init_kwargs"]["n_estimators"] == 77 assert captured["init_kwargs"]["zero_threshold"] == 0.1 assert captured["init_kwargs"]["zero_inflated_vars"] == {"public_assistance"} - assert captured["init_kwargs"]["nonnegative_vars"] == {"public_assistance"} + assert captured["init_kwargs"]["nonnegative_vars"] == set() assert "weight" in captured["fit_columns"] assert captured["fit_kwargs"]["weight_col"] == "weight" assert set(integration["seed_data"]["public_assistance"].tolist()) <= { @@ -3386,7 +3386,7 @@ def generate(self, frame, seed=None): 200.0, } - def test_signed_zero_inflated_donor_vars_are_not_clamped(self, monkeypatch): + def test_support_sensitive_donor_vars_do_not_force_clamps(self, monkeypatch): captured: dict[str, dict[str, object]] = {} class FakeRegimeAwareDonorImputer: @@ -3430,8 +3430,8 @@ def __init__(self, **kwargs): target_vars=target_vars, ) - assert captured["regime_aware"]["nonnegative_vars"] == {"public_assistance"} - assert captured["zi_qrf"]["nonnegative_vars"] == {"public_assistance"} + assert "nonnegative_vars" not in captured["regime_aware"] + assert captured["zi_qrf"]["nonnegative_vars"] == set() assert captured["zi_qrf"]["zero_inflated_vars"] == { "partnership_s_corp_income", "public_assistance", diff --git a/tests/pipelines/test_zi_qrf_backend.py b/tests/pipelines/test_zi_qrf_backend.py index 47b85e0..f717021 100644 --- a/tests/pipelines/test_zi_qrf_backend.py +++ b/tests/pipelines/test_zi_qrf_backend.py @@ -11,7 +11,7 @@ guarantees v8 relies on: 1. The factory (`_build_donor_imputer`) populates `zero_inflated_vars` - from the `VariableSupportFamily.ZERO_INFLATED_POSITIVE` variables + from the `VariableSupportFamily.SUPPORT_SENSITIVE` variables when `backend == "zi_qrf"`, and leaves it empty otherwise. 2. `ColumnwiseQRFDonorImputer.fit` trains a `RandomForestClassifier` zero-gate on each whitelisted column whose observed zero fraction @@ -151,16 +151,14 @@ def spy_predict(x_values: np.ndarray, **kwargs): class TestBuildDonorImputerFactory: """The pipeline factory wires zero_inflated_vars only when backend='zi_qrf'.""" - def _factory( - self, backend: str - ) -> ColumnwiseQRFDonorImputer: + def _factory(self, backend: str) -> ColumnwiseQRFDonorImputer: config = USMicroplexBuildConfig( donor_imputer_backend=backend, donor_imputer_qrf_n_estimators=25, ) pipeline = USMicroplexPipeline(config=config) # Variables chosen to span support families: - # qualified_dividend_income, taxable_interest_income → ZERO_INFLATED_POSITIVE + # qualified_dividend_income, taxable_interest_income → SUPPORT_SENSITIVE # age → BOUNDED_INTEGER # These are all real PolicyEngine-US variable names with explicit # semantic specs in microplex_us.variables. diff --git a/tests/test_variables.py b/tests/test_variables.py index e67bb9c..d39eb02 100644 --- a/tests/test_variables.py +++ b/tests/test_variables.py @@ -160,10 +160,7 @@ def test_donor_imputation_block_specs_include_match_strategies_and_restored_vari "qualified_dividend_income", "non_qualified_dividend_income", ) - assert ( - specs[0].strategy_for("dividend_income") - is DonorMatchStrategy.ZERO_INFLATED_POSITIVE - ) + assert specs[0].strategy_for("dividend_income") is DonorMatchStrategy.RANK assert specs[0].native_entity is EntityType.PERSON assert specs[0].condition_entities == ( EntityType.PERSON, @@ -178,10 +175,7 @@ def test_donor_imputation_block_specs_include_match_strategies_and_restored_vari EntityType.HOUSEHOLD, EntityType.TAX_UNIT, ) - assert ( - specs[1].strategy_for("taxable_interest_income") - is DonorMatchStrategy.ZERO_INFLATED_POSITIVE - ) + assert specs[1].strategy_for("taxable_interest_income") is DonorMatchStrategy.RANK def test_donor_imputation_block_specs_use_zero_inflated_matching_for_sparse_irs_amounts(): @@ -208,14 +202,14 @@ def test_donor_imputation_block_specs_use_zero_inflated_matching_for_sparse_irs_ ) assert ( by_variable[variable_name].strategy_for(variable_name) - is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + is DonorMatchStrategy.RANK ) assert by_variable["partnership_s_corp_income"].native_entity is EntityType.PERSON assert ( by_variable["partnership_s_corp_income"].strategy_for( "partnership_s_corp_income" ) - is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + is DonorMatchStrategy.RANK ) @@ -312,17 +306,15 @@ def test_state_program_proxy_semantics_are_registered(): from microplex_us.variables import variable_semantic_spec_for has_medicaid = variable_semantic_spec_for("has_medicaid") - assert has_medicaid.support_family is VariableSupportFamily.ZERO_INFLATED_POSITIVE - assert ( - has_medicaid.donor_match_strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE - ) + assert has_medicaid.support_family is VariableSupportFamily.SUPPORT_SENSITIVE + assert has_medicaid.donor_match_strategy is DonorMatchStrategy.RANK assert has_medicaid.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT assert has_medicaid.projection_aggregation is ProjectionAggregation.MAX for variable_name in ("public_assistance", "ssi", "social_security"): spec = variable_semantic_spec_for(variable_name) - assert spec.support_family is VariableSupportFamily.ZERO_INFLATED_POSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE + assert spec.donor_match_strategy is DonorMatchStrategy.RANK def test_sparse_irs_ald_semantics_are_registered(): @@ -335,8 +327,8 @@ def test_sparse_irs_ald_semantics_are_registered(): ): spec = variable_semantic_spec_for(variable_name) assert spec.native_entity is EntityType.PERSON - assert spec.support_family is VariableSupportFamily.ZERO_INFLATED_POSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE + assert spec.donor_match_strategy is DonorMatchStrategy.RANK assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT @@ -345,8 +337,8 @@ def test_partnership_income_semantics_remain_person_native(): spec = variable_semantic_spec_for("partnership_s_corp_income") assert spec.native_entity is EntityType.PERSON - assert spec.support_family is VariableSupportFamily.ZERO_INFLATED_SIGNED - assert spec.donor_match_strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE + assert spec.donor_match_strategy is DonorMatchStrategy.RANK assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT @@ -419,8 +411,8 @@ def test_rental_income_components_use_sparse_asset_conditioning(): for variable_name in ("rental_income_positive", "rental_income_negative"): spec = variable_semantic_spec_for(variable_name) - assert spec.support_family is VariableSupportFamily.ZERO_INFLATED_POSITIVE - assert spec.donor_match_strategy is DonorMatchStrategy.ZERO_INFLATED_POSITIVE + assert spec.support_family is VariableSupportFamily.SUPPORT_SENSITIVE + assert spec.donor_match_strategy is DonorMatchStrategy.RANK assert spec.condition_score_mode is ConditionScoreMode.VALUE_AND_SUPPORT assert ( spec.preferred_condition_vars diff --git a/uv.lock b/uv.lock index 8dc040d..c5fbec4 100644 --- a/uv.lock +++ b/uv.lock @@ -1124,8 +1124,8 @@ wheels = [ [[package]] name = "microimpute" -version = "1.15.1" -source = { registry = "https://pypi.org/simple" } +version = "2.1.1" +source = { git = "https://github.com/PolicyEngine/microimpute.git?rev=27d23090dcf04c2c30ae369b0a209e27eb3659f0#27d23090dcf04c2c30ae369b0a209e27eb3659f0" } dependencies = [ { name = "joblib" }, { name = "numpy" }, @@ -1141,10 +1141,6 @@ dependencies = [ { name = "statsmodels" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/97/17/d621d4ed40e0afac6f1a2c4dea423783576613820d1460ae30d65c48309e/microimpute-1.15.1.tar.gz", hash = "sha256:af409525d475efeb8c8526e9630834c4f16563e15cd42665117d2a1397fcf404", size = 128669, upload-time = "2026-03-09T15:59:33.885Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/42/f1/1d80dbb8cc9e85962524a4233cfe42ac1a78e6f2cc0ca479ed1817f6d8ae/microimpute-1.15.1-py3-none-any.whl", hash = "sha256:f5f2de91eeedea28ddae42d42757b558d6eb85c1a1fd6a9097b53e309f19369c", size = 111313, upload-time = "2026-03-09T15:59:32.553Z" }, -] [[package]] name = "microplex" @@ -1207,7 +1203,7 @@ requires-dist = [ { name = "duckdb", specifier = ">=1.2" }, { name = "h5py", specifier = ">=3.10" }, { name = "jupyter-book", marker = "extra == 'docs'", specifier = ">=0.15,<0.16" }, - { name = "microimpute", marker = "python_full_version >= '3.12' and python_full_version < '3.15' and extra == 'policyengine'", specifier = "==1.15.1" }, + { name = "microimpute", marker = "python_full_version >= '3.12' and python_full_version < '3.15' and extra == 'policyengine'", git = "https://github.com/PolicyEngine/microimpute.git?rev=27d23090dcf04c2c30ae369b0a209e27eb3659f0" }, { name = "microplex", extras = ["calibrate"], git = "https://github.com/PolicyEngine/microplex.git?rev=1e0627182f9df40aacd7043c96956c2895bf9d30" }, { name = "microunit", marker = "extra == 'policyengine'", specifier = ">=0.1.0" }, { name = "policyengine-us", marker = "python_full_version >= '3.11' and python_full_version < '3.15' and extra == 'policyengine'", specifier = "==1.715.2" },