From ac86b9e887ef99fd1870a0ee53deb224dc14ed04 Mon Sep 17 00:00:00 2001 From: Marc LeBlanc <7050295+marcleblanc2@users.noreply.github.com> Date: Fri, 26 Jun 2026 00:44:57 -0600 Subject: [PATCH] Add Unicode scan CI gate Amp-Thread-ID: https://ampcode.com/threads/T-019f029c-24d2-71ff-a2f0-cba4d6a90d63 Co-authored-by: Amp --- .github/workflows/validate.yml | 3 ++ AGENTS.md | 1 + src/src_py_lib/utils/logging.py | 2 +- tests/test_tsv.py | 4 +- tests/test_unicode_scan.py | 22 +++++++++++ tests/unicode_scan.py | 65 +++++++++++++++++++++++++++++++++ 6 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 tests/test_unicode_scan.py create mode 100644 tests/unicode_scan.py diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 58c46c7..e08cde1 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -181,6 +181,9 @@ jobs: - name: Validate lockfile run: uv lock --check + - name: Scan for Unicode characters + run: uv run --frozen python tests/unicode_scan.py + - name: Lint Python run: uv run --frozen ruff check . diff --git a/AGENTS.md b/AGENTS.md index b609a79..207b4db 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,6 +17,7 @@ actionlint npx --yes markdownlint-cli2@0.22.1 uv sync +uv run python tests/unicode_scan.py uv run ruff format . uv run ruff check . uv run pyright diff --git a/src/src_py_lib/utils/logging.py b/src/src_py_lib/utils/logging.py index 323ab56..a09638f 100644 --- a/src/src_py_lib/utils/logging.py +++ b/src/src_py_lib/utils/logging.py @@ -378,7 +378,7 @@ def cli_logging_handlers( """Attach terminal (and optional bridge) handlers to the named loggers. Adds and removes only its own handlers, restores prior logger levels on - exit, and never touches the root logger or other handlers — safe to + exit, and never touches the root logger or other handlers - safe to compose with a host application's logging configuration. 
With `suppress_http_dependency_logs=False`, httpx/httpcore loggers are diff --git a/tests/test_tsv.py b/tests/test_tsv.py index 1e77cd7..aed4bc2 100644 --- a/tests/test_tsv.py +++ b/tests/test_tsv.py @@ -31,9 +31,9 @@ def test_format_tsv_value_sanitizes_and_truncates_non_url_fields(self) -> None: def test_display_width_handles_wide_and_combining_characters(self) -> None: self.assertEqual(display_width("a"), 1) - self.assertEqual(display_width("測"), 2) + self.assertEqual(display_width("\u6e2c"), 2) self.assertEqual(display_width("e\u0301"), 1) - self.assertEqual(pad_display("測", 4), "測 ") + self.assertEqual(pad_display("\u6e2c", 4), "\u6e2c ") def test_write_tsv_creates_aligned_table(self) -> None: with tempfile.TemporaryDirectory() as directory: diff --git a/tests/test_unicode_scan.py b/tests/test_unicode_scan.py new file mode 100644 index 0000000..385aea9 --- /dev/null +++ b/tests/test_unicode_scan.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +import unittest + +from tests import unicode_scan + + +class UnicodeScanTests(unittest.TestCase): + def test_ascii_text_has_no_findings(self) -> None: + self.assertEqual(unicode_scan.findings_in_text("plain - ascii 'text' x 2\n"), []) + + def test_non_ascii_characters_are_flagged(self) -> None: + findings = unicode_scan.findings_in_text("first line\na \u2014 b \u2192 c\n") + self.assertEqual(findings, [(2, 3, "\u2014"), (2, 7, "\u2192")]) + + def test_invisible_character_is_flagged(self) -> None: + findings = unicode_scan.findings_in_text("zero\u200bwidth") + self.assertEqual(findings, [(1, 5, "\u200b")]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unicode_scan.py b/tests/unicode_scan.py new file mode 100644 index 0000000..0b27cb7 --- /dev/null +++ b/tests/unicode_scan.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +"""Fail on non-ASCII characters in tracked text files. + +Usage: uv run python tests/unicode_scan.py +Exit code 1 when any Unicode character outside ASCII is found. 
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import unicodedata
+from pathlib import Path
+
+
+def findings_in_text(text: str) -> list[tuple[int, int, str]]:
+    """Return 1-based (line, column, character) for every non-ASCII character."""
+    findings: list[tuple[int, int, str]] = []
+    # Split on "\n" only: str.splitlines() also breaks on (and discards) U+0085,
+    # U+2028 and U+2029, so those non-ASCII characters would never be reported.
+    for line_number, line in enumerate(text.split("\n"), start=1):
+        for column_number, character in enumerate(line, start=1):
+            if not character.isascii():
+                findings.append((line_number, column_number, character))
+    return findings
+
+
+def tracked_files(root: Path) -> list[Path]:
+    """Return root-joined paths for every entry `git ls-files -z` lists in root."""
+    listing = subprocess.run(
+        ["git", "ls-files", "-z"], capture_output=True, text=True, check=True, cwd=root
+    )
+    return [root / name for name in listing.stdout.split("\0") if name]
+
+
+def describe(character: str) -> str:
+    """Format a character with its Unicode name and code point for the report."""
+    name = unicodedata.name(character, f"U+{ord(character):04X}")
+    return f"`{character}` ({name}, U+{ord(character):04X})"
+
+
+def main() -> int:
+    """Report non-ASCII characters in tracked UTF-8 files; return 1 if any, else 0."""
+    root = Path(__file__).resolve().parent.parent
+    finding_count = 0
+    for path in tracked_files(root):
+        try:
+            text = path.read_text(encoding="utf-8")
+        except (UnicodeDecodeError, FileNotFoundError, IsADirectoryError):
+            continue  # binary or vanished files are not lintable text
+        for line_number, column_number, character in findings_in_text(text):
+            print(
+                f"{path.relative_to(root)}:{line_number}:{column_number} "
+                f"non-ASCII character {describe(character)}"
+            )
+            finding_count += 1
+    if finding_count:
+        print(f"\nFound {finding_count} non-ASCII character(s).")
+        return 1
+    print("No non-ASCII characters found.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())