Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9a4f8a4
Update unit tests for new 2024 data and functions
jeancochrane Mar 25, 2026
f05e905
Update sample tax bills for 2024
jeancochrane Mar 26, 2026
ef48dc8
Fix a few final lines of `test-tax_bill.R`
jeancochrane Mar 27, 2026
28caaab
Add to vetdis calc in tests
jeancochrane Mar 30, 2026
dadbe77
Merge branch '2024-data-update' into jeancochrane/72-update-unit-test…
jeancochrane Apr 1, 2026
b0684df
Merge branch 'jeancochrane/72-update-unit-tests-to-include-2024-data-…
jeancochrane Apr 1, 2026
dac3bb4
Further updates to sample tax bills, and test contents of sample_tax_…
jeancochrane Apr 2, 2026
cf2a2a0
Update `tax_bill` test to account for fund-level reporting differenci…
jeancochrane Apr 6, 2026
e0dcd36
Update `test-tax_bill.R` to finish 2024 sample bill tests
jeancochrane Apr 14, 2026
ea1c91c
Make sure `test-sample_tax_bills_summary.R` always runs relative to t…
jeancochrane Apr 14, 2026
3f707fc
Split agency lookup snapshot tests out into dedicated test
jeancochrane Apr 14, 2026
0746750
Merge branch '2024-data-update' into jeancochrane/update-sample-tax-b…
jeancochrane Apr 14, 2026
d636e00
Update `lookup_agency` snapshot tests for 2024 data
jeancochrane Apr 14, 2026
b22a117
Fail loudly if `sample_tax_bills` directory doesn't exist in tests
jeancochrane Apr 14, 2026
e984b74
Update `test-sample_tax_bills_summary.R` so that tests can run on bui…
jeancochrane Apr 15, 2026
cc0752c
Small tweak and add docs to `tax_bill` comparison test
jeancochrane Apr 15, 2026
6df55bb
Add some uncertainty to language about future agencies in `sample_tax…
jeancochrane Apr 22, 2026
ad73e8b
Clarify the way `name_priority` works for transit TIFs in `sample_tax…
jeancochrane Apr 22, 2026
7b3104d
Better comments on `lookup_agency()` snapshot tests, plus make them f…
jeancochrane Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
- id: file-contents-sorter
files: '^\.Rbuildignore$'
- id: end-of-file-fixer
exclude: '\.Rd|\.csv'
exclude: '(\.Rd|\.csv|^tests/testthat/_snaps/.*\.md$)'
Comment thread
jeancochrane marked this conversation as resolved.
- id: mixed-line-ending
args: ['--fix=no']
- repo: local
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions data-raw/sample_tax_bills/agency_name_match.csv
Git LFS file not shown
72 changes: 56 additions & 16 deletions data-raw/sample_tax_bills/sample_tax_bills_detail.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@ library(data.table)
# To do this, we first scan the PDFs with tabulizer, match the output tables
# with the output from tax_bill(), then clean up

# Get a list of all PDFs in sample_tax_bills/
# Get a list of all PDFs in the sample_tax_bills directory.
# We curate this set of files manually. When adding a new tax year, you should
# download a new set of sample tax bills for that year from the Treasurer's tax
# bill portal, with an eye toward decent coverage of classes, geographic
# areas, and exemptions. For an idea of what decent coverage looks like, refer
# to prior years of sample bills
list_pdf_inputs <- list.files(
path = "data-raw/sample_tax_bills",
pattern = "*.pdf",
Expand All @@ -23,10 +28,8 @@ row_to_names <- function(df) {
df[-1, ]
}


# Different tax bills can have different table sizes depending on the number of
# taxing districts.

extract_tax_bill <- function(file) {
base_file <- basename(file)
tbl <- pdf_text(file) %>%
Expand Down Expand Up @@ -64,7 +67,8 @@ extract_tax_bill <- function(file) {
paste0(
"TAXES|Assess|Property|EAV|Local Tax|",
"Total Tax|Do not|Equalizer|cookcountyclerk.com|",
"Pursuant|meaning of|If paying later|\\d{15}+|By \\d{2}/"
"Pursuant|meaning of|If paying later|\\d{15}+|By \\d{2}/|",
"Visit COOKCOUNTYCLERKIL.GOV"
Comment thread
jeancochrane marked this conversation as resolved.
)
)
)
Expand All @@ -79,7 +83,6 @@ extract_tax_bill <- function(file) {
return(out)
}


# Collect all scanned tables + meta data in a data frame
bills <- map(list_pdf_inputs, extract_tax_bill)
bills_df <- bind_rows(bills)
Expand All @@ -94,28 +97,65 @@ bills_df <- bills_df %>%
filter(!stringr::str_detect(agency_name, " Total")) %>%
mutate(across(c(year, final_tax:prev_tax), readr::parse_number))

# Load agency name lookup from file
# Load agency name lookup from file. This lookup maps agency names to numbers.
#
# We maintain this file by hand. When adding a new year of sample bills, you
# may encounter agencies that are not yet present in this list. To
# add those agencies, perform the join to `bills_df` in the code block below
# and then filter the resulting `bills_df` dataframe for agencies with nulls
# for `agency_num`. Then, use the Clerk's agency rate report and TIF report
# for that year to find the agency number for each missing agency.
#
# The `agency_name_match.csv` file has the following schema:
#
# 1. agency_name: The name of the agency, exactly matching its representation
# on the tax bill. If the same agency has different name representations
# on tax bills across years, usually because the Clerk and Treasurer have
# changed the format of the name for that agency, then you should add a
# duplicate row for the agency with the new name but with identical values
# for the rest of the fields.
# - If an agency fund or transit TIF distribution has its own line item
# on the tax bill, you should create a new row for that line item in
# this crosswalk, but make sure to give it a lower priority in the
# `name_priority` column (see the docs for that column below). That way
# the code below will ensure we roll up those fund values into the
# overall values for the agency.
#
# 2. agency_num: The Clerk's unique identifier for the agency. You can find
# this value in the Clerk's agency rate report and/or TIF report.
#
# 3. name_priority: Integer priority for this name, in descending priority
# order (e.g. 1 is higher priority than 2). We only use this value when
# either A) rolling up a fund into its parent agency, or B) rolling up
# a transit TIF distribution into the TIF agency. In these cases, it's
# important that the parent agency (or the TIF agency) have priority 1.
# All other agencies should have priorities greater than 1, but it doesn't
# matter what those priorities are relative to each other, since we only
# use the `name_priority` field to determine which agency is the parent
# (i.e. the agency in an `agency_num` group whose `name_priority` is 1).
agency_match <- readr::read_csv(
"data-raw/sample_tax_bills/agency_name_match.csv"
)

# Join agency ID numbers to bills table
bills_df <- bills_df %>%
left_join(agency_match, by = "agency_name") %>%
# Agency name join key should be case insensitive, since case can change
# across years
mutate(agency_name_lower = str_to_lower(agency_name)) %>%
left_join(
agency_match %>%
mutate(agency_name_lower = str_to_lower(agency_name)) %>%
select(-agency_name),
by = "agency_name_lower"
) %>%
select(-agency_name_lower) %>%
relocate(agency_num, .before = "agency_name")

# Consolidate Cook County and TIF breakouts into single line-item
# Consolidate funds and TIF breakouts into a single line-item for the parent
# agency
bills_df <- bills_df %>%
mutate(cook = str_detect(
agency_name,
"Cook County Public Safety|Cook County Health Facilities|County of Cook"
)) %>%
group_by(pin, year, cook) %>%
mutate(across(final_tax:prev_tax, ~ ifelse(cook, sum(.x), .x))) %>%
ungroup() %>%
Comment thread
jeancochrane marked this conversation as resolved.
group_by(pin, year, agency_num) %>%
mutate(across(final_tax:prev_tax, sum)) %>%
select(-cook) %>%
filter(!is.na(agency_num), name_priority == 1) %>%
select(-name_priority) %>%
ungroup()
Expand Down
4 changes: 2 additions & 2 deletions data-raw/sample_tax_bills/sample_tax_bills_detail.csv
Git LFS file not shown
7 changes: 6 additions & 1 deletion data-raw/sample_tax_bills/sample_tax_bills_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ library(stringr)
library(readr)
library(data.table)

# Load sample tax bills summary data from file
# Load sample tax bills summary data from file.
#
# We maintain this file by hand, and we need to update it whenever
# we add new tax bills to `data-raw/sample_tax_bills/` each year. In the
# future, it might be nice to script a process for extracting these summaries
# automatically
sample_tax_bills_summary <- readr::read_csv(
"data-raw/sample_tax_bills/sample_tax_bills_summary.csv",
col_types = cols(pin = "c", tax_code = "c", class = "c")
Expand Down
4 changes: 2 additions & 2 deletions data-raw/sample_tax_bills/sample_tax_bills_summary.csv
Git LFS file not shown
Binary file modified data/sample_tax_bills_detail.rda
Binary file not shown.
Binary file modified data/sample_tax_bills_summary.rda
Binary file not shown.
3 changes: 2 additions & 1 deletion tests/testthat/_snaps/lookup_agency_over_time/lookup.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# lookup values/data are correct
# agency lookup matches snapshots
Comment thread
jeancochrane marked this conversation as resolved.

{
"type": "list",
Expand Down Expand Up @@ -68,3 +68,4 @@
}
]
}

Comment thread
jeancochrane marked this conversation as resolved.
21 changes: 11 additions & 10 deletions tests/testthat/_snaps/lookup_agency_summary/lookup.md

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion tests/testthat/test-lookup.R
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,12 @@ test_that("lookup values/data are correct", {
3178945619, 286280738, 0, 638172798
)
)
})

# These snapshot tests check to make sure we haven't done something obviously
# wrong to mess up the `lookup_agency()` function. We do that by testing the
# function on a sample of real-world data
test_that("agency lookup matches snapshots", {
Comment thread
jeancochrane marked this conversation as resolved.
local_edition(3) # Enable snapshot testing
expect_snapshot_value(
# Dataframe is necessary for json serialization in expect_snapshot_value,
Expand All @@ -297,8 +302,13 @@ test_that("lookup values/data are correct", {
style = "json2",
variant = "lookup_agency_over_time"
)
# Extract a sample of years from the sample tax bills to test. We need to
# restrict the set of years before we test, otherwise the subsequent
# snapshot test will fail every time we add a new year of data
sum_df_2018_to_2024 <- sum_df %>% filter(year >= 2018, year <= 2024)
expect_snapshot_value(
as.data.frame(lookup_agency(sum_df$year, sum_df$tax_code)),
lookup_agency(sum_df_2018_to_2024$year, sum_df_2018_to_2024$tax_code) %>%
as.data.frame(),
style = "json2",
variant = "lookup_agency_summary"
)
Expand Down
36 changes: 36 additions & 0 deletions tests/testthat/test-sample_tax_bills_summary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
context("Test sample_tax_bills_summary")
Comment thread
jeancochrane marked this conversation as resolved.

##### TEST sample_tax_bills_summary #####

library(dplyr)

# Open a SQLite connection to the database file whose path is read from the
# PTAXSIM_DB_PATH environment variable, then bind that connection to the name
# `ptaxsim_db_conn` in the global environment so that it is visible outside of
# this file's own scope.
# NOTE(review): presumably the global binding exists because the package's
# functions look up `ptaxsim_db_conn` by name -- confirm against the package.
# NOTE(review): nothing in the visible part of this file disconnects the
# connection; confirm whether cleanup happens elsewhere.
ptaxsim_db_conn <- DBI::dbConnect(
RSQLite::SQLite(),
Sys.getenv("PTAXSIM_DB_PATH")
)
assign("ptaxsim_db_conn", ptaxsim_db_conn, envir = .GlobalEnv)

# Guard against forgetting to add sample tax bills for a new data year: the
# newest year present in both sample bill datasets must equal the newest data
# year recorded in the database's `metadata` table
test_that("max data year matches max year for sample tax bills", {
  metadata_years <- DBI::dbGetQuery(
    ptaxsim_db_conn,
    "SELECT data_year_max FROM metadata"
  )
  # The query selects a single column, so `pull()` (which extracts the last
  # column by default) returns `data_year_max` as a plain vector
  max_year <- pull(metadata_years)

  expect_equal(max(sample_tax_bills_summary$year), max_year)
  expect_equal(max(sample_tax_bills_detail$year), max_year)
})

# Guard against updating only one of the two sample bill datasets: the summary
# and detail data must cover exactly the same set of PIN-years
test_that("sample_tax_bills_summary PIN-years match sample_tax_bills_detail", {
  # Reduce a sample bills table to its unique (year, pin) pairs, sorted so
  # that the two datasets can be compared row by row
  unique_pin_years <- function(bills) {
    arrange(distinct(bills, year, pin), year, pin)
  }

  summary_pin_years <- unique_pin_years(sample_tax_bills_summary)
  detail_pin_years <- unique_pin_years(sample_tax_bills_detail)

  # Only the values need to match; attribute differences between the two
  # tables are ignored
  expect_equal(summary_pin_years, detail_pin_years, ignore_attr = TRUE)
})
Loading
Loading