sourcegraph · marcleblanc2 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
@@ -185,6 +185,9 @@ jobs:
       - name: Validate lockfile
         run: uv lock --check
 
+      - name: Scan for Unicode characters
+        run: uv run --frozen python tests/unicode_scan.py
+
       - name: Lint Python
         run: uv run --frozen ruff check .
 

@@ -14,9 +14,8 @@ actionlint
 ### Markdown files
 npx --yes -p markdownlint-cli2@0.22.1 -p markdownlint-rule-relative-links@5.1.0 markdownlint-cli2
 
-### Confusable Unicode characters in non-Python files
-# (ruff RUF001-RUF003 covers Python strings/comments/docstrings)
-uv run python tests/confusables.py
+### Non-ASCII characters in tracked text files
+uv run python tests/unicode_scan.py
 
 ### Python files
 
@@ -41,7 +40,7 @@ own checks.
 
 ```bash
 # Fast, no network (also what the pre-commit hook runs):
-# lint, format, pyright, confusable-character scan, unit + fixture
+# lint, format, pyright, non-ASCII character scan, unit + fixture
 # tests, CLI rejection matrix,
 # randomized permission invariants
 uv run tests/run.py
@@ -64,7 +63,7 @@ uv run tests/run.py --performance --baseline-command "uvx src-auth-perms-sync@la
 uv run tests/run.py --update-golden
 ```
 
-- Fixture cases live in `tests/e2e/fixtures/<case>/` — see the README there
+- Fixture cases live in `tests/e2e/fixtures/<case>/` - see the README there
   for the format. Add cases there to cover new mapping behaviors.
 - For manual verification against a real instance, dry-run first (no
   `--apply`), read the planned changes, then `--apply` on a scratch instance
@@ -165,7 +164,7 @@ gh run watch "${RUN_ID}" --repo "${GH_REPO}" --exit-status
 gh release view "v${VERSION}" --repo "${GH_REPO}"
 ```
 
-## Hard invariants — do not break
+## Hard invariants - do not break
 
 Violating these can silently grant the wrong users access to the wrong
 repos.
@@ -206,31 +205,31 @@ organization sync maps SAML groups to Sourcegraph org membership. Read
 CLI lives in `src/src_auth_perms_sync/`; invoke with `uv run src-auth-perms-sync`.
 Strict pyright covers the package. Root modules are entrypoints only:
 
-- `cli.py` — `main()`, arg parsing, owns the CLI description. Module
+- `cli.py` - `main()`, arg parsing, owns the CLI description. Module
   wrappers (`Get`/`Set`/`Restore`/`SyncSamlOrgs`) return result dataclasses
   and never install logging handlers; only `main()` runs CLI-mode logging.
-- `shared/` — cross-workflow helpers: Sourcegraph auth-provider/user list
+- `shared/` - cross-workflow helpers: Sourcegraph auth-provider/user list
   helpers, shared GraphQL operations and TypedDicts, site-config validation,
   and SAML group parsing. `shared/backups.py` defines `RunPaths`: every
   filesystem path for one run, resolved once at the edge
-  (`resolve_run_paths`) and threaded explicitly — never recompute paths
+  (`resolve_run_paths`) and threaded explicitly - never recompute paths
   from cwd or globals below the edge, and honor `run_paths.write_files`
   (False under `--no-files`) before any disk write.
 
 Business workflows live in packages:
 
-- `permissions/` — repo permission sync (`command.py`, `maps.py`,
+- `permissions/` - repo permission sync (`command.py`, `maps.py`,
   `mapping.py`, `sourcegraph.py`, `snapshot.py`, `apply.py`, `queries.py`,
   `types.py`). Add new mapping filters in `permissions/types.py` and
   `permissions/mapping.py`.
-- `orgs/` — SAML group → Sourcegraph organization sync (`command.py`,
+- `orgs/` - SAML group -> Sourcegraph organization sync (`command.py`,
   `queries.py`, `types.py`).
 
 ## Toolchain
 
 - Python 3.11 + [uv](https://docs.astral.sh/uv/). Never invoke `python`
   directly; always `uv run ...`.
-- `uv run pyright` must be clean. No `# type: ignore` to silence —
+- `uv run pyright` must be clean. No `# type: ignore` to silence -
   fix the underlying type.
 - Local tests use stdlib `unittest`: `uv run python -m unittest discover -s tests`.
 - For Sourcegraph mutation-path changes, also verify by dry-running `--get` /
@@ -268,7 +267,7 @@ Short names that ARE acceptable (don't rewrite these on sight):
   `key`, `value`, `name`, `kind`.
 - **Stdlib idioms**: `ctx` for `contextvars.copy_context()`.
 - **Loop / comprehension variables when the type is obvious from one
-  line of context** is still discouraged — prefer `user`, `repo`,
+  line of context** is still discouraged - prefer `user`, `repo`,
   `provider`, `service`, `permission`, `node`, `entry`, `match`,
   `account`, `future`, `executor`, `exception`, `event`, `connection`,
   `response`, `timestamp`, `current`, `outcome`, `index`, `field_name`.

@@ -1,5 +1,5 @@
 # src-auth-perms-sync
-<!-- HUMAN-MAINTAINED SECTION START — DO NOT EDIT THIS SECTION -->
+<!-- HUMAN-MAINTAINED SECTION START - DO NOT EDIT THIS SECTION -->
 
 src-auth-perms-sync automates Sourcegraph's Explicit Permissions GraphQL API,
 setting user-to-repo permissions based on mapping rules, for example:
@@ -147,7 +147,7 @@ for provider in get_result.auth_providers:
 for code_host in get_result.code_hosts:
     ...
 
-# Mapping rules can be passed in memory instead of a maps YAML file —
+# Mapping rules can be passed in memory instead of a maps YAML file -
 # same structure and validation as maps.yaml entries:
 rules: list[src.MappingRule] = [
     {
@@ -167,11 +167,11 @@ result = src.Set(config, mapping_rules=rules)
 ```
 
 The import API does not read environment variables or `.env` files on its
-own — those apply to the CLI only. Pass every value explicitly to
+own - those apply to the CLI only. Pass every value explicitly to
 `src.Config(...)` (read `os.environ` yourself if you want env-driven
 configuration, as the example above does).
 
-Module mode never touches your `logging` handlers or the root logger — your
+Module mode never touches your `logging` handlers or the root logger - your
 application's logging config stays in charge. To see progress messages:
 
 ```python
@@ -297,22 +297,22 @@ snapshots that make `--apply` reversible.
     src-auth-perms-sync sync-saml-orgs --users-without-explicit-perms
     ```
 
-    - Same user filters as `get` and `set`; a mode flag is required — there
+    - Same user filters as `get` and `set`; a mode flag is required - there
       is no bare `sync-saml-orgs`
 
 ### Org sync behavior
 
 - Org names are `synced-<configID>-<group name>` (non-alphanumeric characters
   become `-`). The `synced-` prefix marks tool ownership: the sync only ever
   modifies orgs whose name carries it, so manually created orgs are never touched.
-- The org sync mode is always explicit — no surprises:
+- The org sync mode is always explicit - no surprises:
   - **Full** (`sync-saml-orgs --full`, or `set --full` / `--repos*`
     `--sync-saml-orgs`): converges every synced org against all users. A synced
     org whose SAML group disappeared has all members removed, but the org itself
     is kept (its settings survive in case the group comes back).
   - **Scoped** (user filters on `sync-saml-orgs`, or `set --users` /
     `--users-without-explicit-perms` / `--created-after` with
-    `--sync-saml-orgs`): syncs org membership for exactly the selected users —
+    `--sync-saml-orgs`): syncs org membership for exactly the selected users -
     per-user additions AND removals, computed from each user's own SAML
     assertion and org list. Other users' memberships never change, and no full
     user scan or org member listing is needed, so API traffic stays
@@ -326,16 +326,16 @@ Run `src-auth-perms-sync --help` for options
 
 ```text
 src-auth-perms-sync-runs/<src_endpoint>/
-├── auth-providers.yaml
-├── code-hosts.yaml
-├── maps.yaml
-└── runs
-    └── timestamp-command
-        ├── before.json
-        ├── after.json
-        ├── diff.json
-        ├── log.json
-        └── maps.yaml
+|-- auth-providers.yaml
+|-- code-hosts.yaml
+|-- maps.yaml
+`-- runs
+    `-- timestamp-command
+        |-- before.json
+        |-- after.json
+        |-- diff.json
+        |-- log.json
+        `-- maps.yaml
 ```
 
 - The `src-auth-perms-sync-runs` dir is created under your current working directory
@@ -353,4 +353,4 @@ src-auth-perms-sync-runs/<src_endpoint>/
   - An `after.json` file, capturing the new state
   - A `diff.json` file, a shorter, reviewable file containing the diffs between before and after
 
-<!-- HUMAN-MAINTAINED SECTION END — DO NOT EDIT ABOVE -->
+<!-- HUMAN-MAINTAINED SECTION END - DO NOT EDIT ABOVE -->
@@ -32,7 +32,7 @@
 - `get --repos <name>` still scans every user's explicit grants to find one
   repo's holders (~400 s at 10k users). A repo-centric read
   (`repository.permissionsInfo.users` + site-admin disambiguation, as the
-  test harness already does) would make it seconds — see the repo-centric
+  test harness already does) would make it seconds - see the repo-centric
   section below.
 
 ## Low priority: Repo-centric path, when users > repos, or for cross-checking
@@ -50,10 +50,10 @@ Reasons we might want to bring it back later:
 - **Targeted snapshots.** A "planned-scope" capture (only the repos
   the mapping rules touch) is faster than a full instance scan when
   the user-centric path is the long pole AND the planned set is small.
-  Would need either a server-side `source` filter on the repo→users
+  Would need either a server-side `source` filter on the repo->users
   connection, or a follow-up user-centric `source: API` query per
   ambiguous (site-admin) user to disambiguate.
-- **Adaptive capture path after SG adds `source` to repo→users.** Once
+- **Adaptive capture path after SG adds `source` to repo->users.** Once
   `RepositoryPermissionsInfo.users(source: PermissionSource)` exists,
   compute the expected request count both ways before snapshotting:
   sum `userCount` across all auth providers and sum `repoCount` across
@@ -70,7 +70,7 @@ If/when we revisit:
    ambiguous-user follow-up to be correct).
 2. Restore `QUERY_REPO_EXPLICIT_USERS` from git history; implement
    `list_repo_explicit_users` returning `(definitely, ambiguous)` and
-   actually consume both buckets — the previous code did neither.
+   actually consume both buckets - the previous code did neither.
 3. Add a CLI flag (e.g. `--cross-check-capture`) gated behind a clear
    "this doubles capture cost" warning.
 
@@ -89,7 +89,7 @@ SAML actually persists the group list. Recovery options for each:
 - OIDC has no `allowGroups` field on `OpenIDConnectAuthProvider`.
   `UserClaims` stores only name/email fields; the `groups` claim is never
   parsed. Recovery needs an upstream change to persist the claim.
-- GitHub OAuth has `allowOrgs`, `allowOrgsMap` (org→teams), and
+- GitHub OAuth has `allowOrgs`, `allowOrgsMap` (org->teams), and
   `requiredSsoOrgs`. Org/team checks happen live in `verifyUserOrgs` /
   `verifyUserTeams` and are discarded. Recovery needs an upstream change to
   persist the claim.

@@ -88,15 +88,15 @@ Observed during the concurrent captures:
 - `pgsql-0` CPU (`kubectl top`): 7,636-7,683 millicores of 8,000 (saturated).
 - `frontend` / `gitserver` CPU: 124-138m / 2-3m (idle bystanders).
 - `pg_stat_activity`: 29 active statements, all
-  `permsStore.ListUserPermissions`, **zero wait events** — pure CPU, no lock
+  `permsStore.ListUserPermissions`, **zero wait events** - pure CPU, no lock
   contention.
 - `pg_stat_statements`: `permsStore.ListUserPermissions` at 24,026 calls,
   27,635.6s total, 1,150ms mean.
-- Per-client capture throughput: 23 users/sec solo → 2-4 users/sec at 4-way
+- Per-client capture throughput: 23 users/sec solo -> 2-4 users/sec at 4-way
   concurrency.
-- Aggregate throughput: 8-16 users/sec at 4-way — **below the 23 users/sec a
+- Aggregate throughput: 8-16 users/sec at 4-way - **below the 23 users/sec a
   single client achieves alone** (negative scaling).
-- ALB (CloudWatch): no 5xx, no rejected connections — the edge and frontend
+- ALB (CloudWatch): no 5xx, no rejected connections - the edge and frontend
   are not the bottleneck.
 - Collateral failure: the fifth client's queries exceeded the 60s read timeout
   under this load; 5 retry attempts exhausted; its run failed with exit 1.
@@ -144,7 +144,7 @@ from `github.com/sourcegraph/sourcegraph`:
 Measured on the 10k-user / 50k-repo test instance, the presence probe
 `User.permissionsInfo.repositories(source: API, first: 1)` costs 225-350ms of
 server work per user, and alias batching barely helps (21,004 single-user
-probes averaged 351.6ms; 25-user batches averaged 5,616ms ≈ 224.7ms/user). A
+probes averaged 351.6ms; 25-user batches averaged 5,616ms ~ 224.7ms/user). A
 single `set --users-without-explicit-perms` run probing all 10,002 users at
 batch size 1 spent 4,269s of its 5,210s total in these probes.
 
@@ -158,9 +158,9 @@ Reading the resolver code in `github.com/sourcegraph/sourcegraph` explains why
   `updatedAt` fields. The rows are discarded afterward.
 - `userPermissionsInfoResolver.Repositories` (permissions_info.go) builds a
   CTE (`reposPermissionsInfoQueryFmt` in perms_store.go) that **materializes
-  every repo accessible to the user** — a full `repo` ⋈
-  `external_service_repos` ⋈ `external_services` join with the correlated
-  authz `EXISTS` predicate and an `ORDER BY` — before the outer query applies
+  every repo accessible to the user** - a full `repo` join
+  `external_service_repos` join `external_services` join with the correlated
+  authz `EXISTS` predicate and an `ORDER BY` - before the outer query applies
   `urp.source = 'API'` and the LIMIT. `first: 1` becomes `LIMIT 2` on the
   outer query only; the CTE is not short-circuited.
 - Requesting `totalCount` adds a second independent execution of the same CTE
@@ -180,7 +180,7 @@ Client-side mitigation shipped in `src-auth-perms-sync` (2026-06-12):
 locally BEFORE probing, so probes scale with the rule-matched user count
 instead of the instance's user count, and user hydration runs as aliased
 25-user batches instead of one `UserByID` request per user. The remaining
-inherent cost — ~225ms x probed user — is exactly what the
+inherent cost - ~225ms x probed user - is exactly what the
 presence/filter API requested below would remove, and
 `get --users-without-explicit-perms` still has to probe every active user
 because its semantics are instance-wide.

@@ -470,7 +470,7 @@ def write_text_report(
         )
     r_squared = "n/a" if model.r_squared is None else f"{model.r_squared:.4f}"
     print("\nFit quality:")
-    print(f"  R²: {r_squared}")
+    print(f"  R^2: {r_squared}")
     print(f"  mean absolute error: {model.mean_absolute_error_megabytes:.2f} MiB")
     print(f"  p95 absolute error: {model.p95_absolute_error_megabytes:.2f} MiB")
     print(f"  max absolute error: {model.max_absolute_error_megabytes:.2f} MiB")

@@ -1,4 +1,4 @@
-# User → Repo permission mapping rules
+# User -> Repo permission mapping rules
 
 # Maintain your maps.yaml file, using the values from auth-providers.yaml and code-hosts.yaml,
 # which are created by the --get command, under `src-auth-perms-sync-runs/<endpoint>/`

@@ -30,9 +30,9 @@ classifiers = [
     "Typing :: Typed",
 ]
 dependencies = [
-    "json5>=0.14.0",
+    "json5>=0.15.0",
     "pyyaml>=6.0.3",
-    "src-py-lib[otel]==0.3.0",
+    "src-py-lib[otel]==0.3.1",
 ]
 keywords = [
     "Sourcegraph"
@@ -63,9 +63,7 @@ line-length = 100
 extend-exclude = ["src-auth-perms-sync-runs"]
 
 [tool.ruff.lint]
-# RUF001-RUF003 ban confusable Unicode (en dash, multiplication sign,
-# lookalike letters) in Python strings, comments, and docstrings;
-# tests/confusables.py applies the same policy to non-Python files.
+# tests/unicode_scan.py bans non-ASCII characters in tracked text files.
 select = ["E", "F", "I", "B", "UP", "SIM", "RUF001", "RUF002", "RUF003"]
 
 [tool.uv]

@@ -85,7 +85,7 @@ def _load_organization_sync_state(
         if provider["serviceType"] == saml_groups.SAML_SERVICE_TYPE
     ]
     if not saml_providers:
-        log.warning("No SAML auth providers found — nothing to sync.")
+        log.warning("No SAML auth providers found - nothing to sync.")
         return None
     attribute_names_by_provider = saml_groups.attribute_names_by_provider_key(
         providers, saml_groups_attribute_name_by_config_id
@@ -104,7 +104,7 @@ def _load_organization_sync_state(
         # Truncated discovery: resolve target names individually and skip
         # orphaned-org cleanup this run.
         if not targets:
-            log.warning("No SAML group memberships found in user accountData — nothing to sync.")
+            log.warning("No SAML group memberships found in user accountData - nothing to sync.")
             return None
         current_user, current_states = _load_current_organization_states(
             client,
@@ -131,7 +131,7 @@ def _load_organization_sync_state(
         if not targets:
             log.warning(
                 "No SAML group memberships found in user accountData "
-                "and no synced orgs exist — nothing to sync."
+                "and no synced orgs exist - nothing to sync."
             )
             return None
         current_states = {
@@ -179,7 +179,7 @@ def _load_scoped_organization_sync_state(
 
     Each user's `accountData` is the complete truth for that user's SAML
     groups, and their own org list is the complete truth for their current
-    synced-org memberships — so additions AND removals are both safe per
+    synced-org memberships - so additions AND removals are both safe per
     user. Users outside the scope are never touched, and neither full user
     streams nor org member pages are loaded.
     """
@@ -217,7 +217,7 @@ def _load_scoped_organization_sync_state(
         len(target.desired_members_by_id) for target in targets.values()
     )
     if not targets:
-        log.info("Selected user(s) hold no SAML group or synced org memberships — nothing to sync.")
+        log.info("Selected user(s) hold no SAML group or synced org memberships - nothing to sync.")
         return None
 
     # Org IDs for orgs the scoped users belong to come from their own org
@@ -307,7 +307,7 @@ def _log_organization_sync_plan(sync_state: _OrganizationSyncState) -> None:
         len(sync_state.plan["removals"]),
     )
     log.info(
-        "Diff (current → desired):\n%s",
+        "Diff (current -> desired):\n%s",
         _render_organization_diff(sync_state.before_snapshot, sync_state.expected_snapshot),
     )
 
@@ -448,7 +448,7 @@ def _finish_organization_apply_with_backup(
     )
     if current_user_after["id"] != sync_state.current_user["id"]:
         log.warning(
-            "Current user changed during org sync (%s → %s); validation still uses org members.",
+            "Current user changed during org sync (%s -> %s); validation still uses org members.",
             sync_state.current_user["username"],
             current_user_after["username"],
         )