diff --git a/changelog.d/705.fixed b/changelog.d/705.fixed new file mode 100644 index 000000000..4c60c9bff --- /dev/null +++ b/changelog.d/705.fixed @@ -0,0 +1 @@ +Use Census work-and-childcare capping inputs for clone-half SPM childcare expenses instead of donor capping shares. diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 582bf428a..0f378075b 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -286,6 +286,7 @@ class CensusCPS_2018(CensusCPS): "A_FNLWGT", "A_LINENO", "A_SPOUSE", + "PERRP", "A_AGE", "A_SEX", "PEDISEYE", diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 1d05e2ca6..1244be4e7 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -90,6 +90,15 @@ ), } +# Census CPS ASEC 2024 technical documentation, PERRP: +# https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf +PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = { + 43: "Opposite Sex Unmarried Partner with Relatives", + 44: "Opposite Sex Unmarried Partner without Relatives", + 46: "Same Sex Unmarried Partner with Relatives", + 47: "Same Sex Unmarried Partner without Relatives", +} + class CPS(Dataset): name = "cps" @@ -572,6 +581,14 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 + perrp = ( + person.PERRP + if "PERRP" in person + else pd.Series(0, index=person.index, dtype=np.int16) + ) + cps["is_unmarried_partner_of_household_head"] = perrp.isin( + PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES.keys() + ) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 @@ -582,6 +599,10 @@ def children_per_parent(col: str) -> pd.DataFrame: add_overtime_occupation(cps, person) +def derive_weeks_worked(weeks_worked: Series | np.ndarray) -> Series | np.ndarray: + return np.clip(weeks_worked, 0, 52) + + def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): """Add income variables. @@ -607,6 +628,7 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): cps["weekly_hours_worked"] = person.HRSWK cps["hours_worked_last_week"] = person.A_HRS1 + cps["weeks_worked"] = derive_weeks_worked(person.WKSWORK) cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index e5208d630..670b6b858 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -584,125 +584,6 @@ def reconcile_ss_subcomponents(predictions, total_ss): } -def derive_clone_capped_childcare_expenses( - donor_pre_subsidy: np.ndarray, - donor_capped: np.ndarray, - clone_pre_subsidy: np.ndarray, - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, -) -> np.ndarray: - """Derive clone-half capped childcare from clone inputs. - - The CPS provides both pre-subsidy childcare and the SPM-specific - capped childcare deduction. For the clone half, we impute only the - pre-subsidy amount, then deterministically rebuild the capped amount - instead of letting a second QRF predict it independently. - - We preserve the donor's observed capping share while also respecting - the clone's own earnings cap. This keeps the clone-half value - consistent with pre-subsidy childcare and avoids impossible outputs - such as capped childcare exceeding pre-subsidy childcare. - """ - - donor_pre_subsidy = np.asarray(donor_pre_subsidy, dtype=float) - donor_capped = np.asarray(donor_capped, dtype=float) - clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float) - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - - donor_cap_share = np.divide( - donor_capped, - donor_pre_subsidy, - out=np.zeros_like(donor_capped, dtype=float), - where=donor_pre_subsidy > 0, - ) - donor_cap_share = np.clip(donor_cap_share, 0.0, 1.0) - capped_from_share = np.maximum(clone_pre_subsidy, 0.0) * donor_cap_share - - if clone_person_data.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - eligible = clone_person_data["is_parent_proxy"].astype(bool) - parent_rows = clone_person_data.loc[ - eligible, ["spm_unit_id", "age", "earnings"] - ].copy() - if parent_rows.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - parent_rows["earnings"] = parent_rows["earnings"].clip(lower=0.0) - parent_rows["age_rank"] = parent_rows.groupby("spm_unit_id")["age"].rank( - method="first", ascending=False - ) - top_two = parent_rows[parent_rows["age_rank"] <= 2].sort_values( - ["spm_unit_id", "age_rank"] - ) - earnings_cap_by_unit = top_two.groupby("spm_unit_id")["earnings"].agg( - lambda values: ( - float(values.iloc[0]) - if len(values) == 1 - else float(np.minimum(values.iloc[0], values.iloc[1])) - ) - ) - earnings_cap = earnings_cap_by_unit.reindex( - clone_spm_unit_ids, fill_value=0.0 - ).to_numpy(dtype=float) - - return np.minimum(capped_from_share, earnings_cap) - - -def _rebuild_clone_capped_childcare_expenses( - data: dict, - time_period: int, - cps_sim, -) -> np.ndarray: - """Rebuild clone-half capped childcare expenses after stage-2 imputation.""" - - n_persons_half = len(data["person_id"][time_period]) // 2 - n_spm_units_half = len(data["spm_unit_id"][time_period]) // 2 - - person_roles = cps_sim.calculate_dataframe( - ["age", "is_tax_unit_head", "is_tax_unit_spouse"] - ) - if len(person_roles) != n_persons_half: - raise ValueError( - "Unexpected person role frame length while rebuilding clone childcare " - f"expenses: got {len(person_roles)}, expected {n_persons_half}" - ) - - clone_person_data = pd.DataFrame( - { - "spm_unit_id": data["person_spm_unit_id"][time_period][n_persons_half:], - "age": person_roles["age"].values, - "is_parent_proxy": ( - person_roles["is_tax_unit_head"].values - | person_roles["is_tax_unit_spouse"].values - ), - "earnings": ( - data["employment_income"][time_period][n_persons_half:] - + data["self_employment_income"][time_period][n_persons_half:] - ), - } - ) - - donor_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - :n_spm_units_half - ] - donor_capped = data["spm_unit_capped_work_childcare_expenses"][time_period][ - :n_spm_units_half - ] - clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - n_spm_units_half: - ] - clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:] - - return derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - ) - - def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS] @@ -807,24 +688,6 @@ def _splice_cps_only_predictions( new_values = np.concatenate([cps_half, pred_values]) data[var] = {time_period: new_values} - if ( - "spm_unit_capped_work_childcare_expenses" in data - and "spm_unit_pre_subsidy_childcare_expenses" in data - ): - n_half = entity_half_lengths.get( - "spm_unit", - len(data["spm_unit_capped_work_childcare_expenses"][time_period]) // 2, - ) - cps_half = data["spm_unit_capped_work_childcare_expenses"][time_period][:n_half] - clone_half = _rebuild_clone_capped_childcare_expenses( - data=data, - time_period=time_period, - cps_sim=cps_sim, - ) - data["spm_unit_capped_work_childcare_expenses"] = { - time_period: np.concatenate([cps_half, clone_half]) - } - del cps_sim return data diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 646878760..0d481a2b4 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -24,7 +24,6 @@ _derive_overtime_occupation_inputs, _impute_clone_cps_features, apply_retirement_constraints, - derive_clone_capped_childcare_expenses, reconcile_ss_subcomponents, ) from policyengine_us_data.datasets.cps.tipped_occupation import ( @@ -132,86 +131,12 @@ def test_pension_income_not_in_cps_only(self): ) def test_capped_childcare_not_in_cps_only(self): - """Capped childcare should be derived from clone-half inputs, not - independently QRF-imputed.""" + """Capped childcare should not be independently QRF-imputed.""" assert "spm_unit_capped_work_childcare_expenses" not in set( CPS_ONLY_IMPUTED_VARIABLES ) -class TestCloneChildcareDerivation: - """Clone-half capped childcare should be derived deterministically.""" - - def test_caps_at_pre_subsidy_and_clone_earnings(self): - donor_pre_subsidy = np.array([10000.0, 4000.0, 6000.0]) - donor_capped = np.array([4000.0, 4000.0, 0.0]) - clone_pre_subsidy = np.array([12000.0, 5000.0, 3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [1, 1, 2, 2, 3], - "age": [40, 38, 35, 33, 29], - "is_parent_proxy": [True, True, True, True, True], - "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([1, 2, 3]), - ) - - np.testing.assert_allclose(result, np.array([3000.0, 0.0, 0.0])) - - def test_uses_single_parent_earnings_cap_for_single_proxy_units(self): - donor_pre_subsidy = np.array([4000.0]) - donor_capped = np.array([4000.0]) - clone_pre_subsidy = np.array([6000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [10], - "age": [31], - "is_parent_proxy": [True], - "earnings": [2500.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([10]), - ) - - np.testing.assert_allclose(result, np.array([2500.0])) - - def test_falls_back_to_zero_without_parent_proxies(self): - donor_pre_subsidy = np.array([3000.0]) - donor_capped = np.array([2000.0]) - clone_pre_subsidy = np.array([3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [20, 20], - "age": [12, 9], - "is_parent_proxy": [False, False], - "earnings": [0.0, 0.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([20]), - ) - - np.testing.assert_allclose(result, np.array([0.0])) - - class TestRetirementConstraints: """Post-processing retirement constraints enforce IRS caps.""" diff --git a/tests/unit/test_reference_partner.py b/tests/unit/test_reference_partner.py new file mode 100644 index 000000000..7d18ce25b --- /dev/null +++ b/tests/unit/test_reference_partner.py @@ -0,0 +1,70 @@ +""" +Tests for reference-person partner extraction from CPS ASEC. + +The public CPS ASEC relationship-to-reference-person variable PERRP identifies +unmarried partners of the household head/reference person. We carry that +through so the SPM childcare cap can distinguish the reference person's partner +from unrelated adults in the same SPM unit. +""" + +import numpy as np +import pandas as pd + +from policyengine_us_data.datasets.cps.census_cps import PERSON_COLUMNS +from policyengine_us_data.datasets.cps.cps import ( + PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES, + add_personal_variables, +) + + +def _person_frame(**columns): + n_persons = len(next(iter(columns.values()))) + data = {column: np.zeros(n_persons, dtype=int) for column in PERSON_COLUMNS} + data.update(columns) + return pd.DataFrame(data) + + +class TestReferencePartner: + """Test suite for CPS relationship-to-reference-person extraction.""" + + def test_census_cps_loads_perrp(self): + assert "PERRP" in PERSON_COLUMNS + + def test_unmarried_partner_perrp_code_table_matches_census_labels(self): + assert PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES == { + 43: "Opposite Sex Unmarried Partner with Relatives", + 44: "Opposite Sex Unmarried Partner without Relatives", + 46: "Same Sex Unmarried Partner with Relatives", + 47: "Same Sex Unmarried Partner without Relatives", + } + + def test_cps_maps_unmarried_partner_from_perrp(self): + person = _person_frame( + PH_SEQ=np.arange(7) + 1, + A_LINENO=np.ones(7), + A_AGE=np.full(7, 35), + PERRP=np.array([40, 43, 44, 45, 46, 47, 48]), + ) + + cps = {} + add_personal_variables(cps, person) + + np.testing.assert_array_equal( + cps["is_unmarried_partner_of_household_head"], + np.array([False, True, True, False, True, True, False]), + ) + + def test_missing_perrp_defaults_to_false(self): + person = _person_frame( + PH_SEQ=np.arange(3) + 1, + A_LINENO=np.ones(3), + A_AGE=np.full(3, 35), + ).drop(columns="PERRP") + + cps = {} + add_personal_variables(cps, person) + + np.testing.assert_array_equal( + cps["is_unmarried_partner_of_household_head"], + np.array([False, False, False]), + ) diff --git a/tests/unit/test_weeks_worked.py b/tests/unit/test_weeks_worked.py new file mode 100644 index 000000000..7f1bc6959 --- /dev/null +++ b/tests/unit/test_weeks_worked.py @@ -0,0 +1,24 @@ +""" +Tests for weeks_worked extraction from CPS ASEC. + +The Census CPS ASEC exposes WKSWORK directly, which we now carry through as +the model input for future-year SPM work-expense calculations. +""" + +import numpy as np + +from policyengine_us_data.datasets.cps.census_cps import PERSON_COLUMNS +from policyengine_us_data.datasets.cps.cps import derive_weeks_worked + + +class TestWeeksWorked: + """Test suite for weeks_worked variable extraction.""" + + def test_census_cps_loads_wkswork(self): + assert "WKSWORK" in PERSON_COLUMNS + + def test_cps_derives_weeks_worked_from_wkswork(self): + np.testing.assert_array_equal( + derive_weeks_worked(np.array([-4, 0, 1, 26, 52, 60])), + np.array([0, 0, 1, 26, 52, 52]), + )