# Patch summary (text before the first "diff --git" header is skipped by patch tools):
#   1. .github/workflows/pr-security-lint.yaml: the job no longer checks out the
#      base branch and pipes `gh pr diff` into a local script. It now calls the
#      weaviate/weaviate security-lint composite action pinned to a full commit
#      SHA, passing the PR number and the workflow token as inputs.
#   2. tools/linter_hidden_unicode.sh: deleted (143 lines). This was the bash
#      wrapper plus embedded Perl scanner that the removed job step invoked.
# NOTE(review): line breaks and indentation were reconstructed from a
# whitespace-flattened copy. The three blank context lines in the first hunk are
# placed by inference from the hunk counts (-1,27 +1,35). Regenerate the patch
# from the commit before applying it.
# NOTE(review): `while () {` and the second "Usage" line look like they lost an
# angle-bracket token during extraction (presumably a filehandle read and an
# argument placeholder). Confirm against the original file.
diff --git a/.github/workflows/pr-security-lint.yaml b/.github/workflows/pr-security-lint.yaml
index 7c1cb63c..241f7c58 100644
--- a/.github/workflows/pr-security-lint.yaml
+++ b/.github/workflows/pr-security-lint.yaml
@@ -1,27 +1,35 @@
 name: PR Security Lint
 
+# SECURITY: This workflow uses pull_request_target intentionally so that the
+# workflow definition runs from the BASE branch (main), not the PR. The
+# composite action it invokes lives at a pinned 40-char SHA in
+# weaviate/weaviate — attackers cannot alter the lint logic via a PR or by
+# tampering with an upstream tag.
+#
+# Rules:
+#   1. Do NOT add `ref: ${{ github.event.pull_request.head.sha }}` or any
+#      reference to PR-controlled refs. The composite uses the GitHub API to
+#      fetch the diff text — no PR code is ever executed.
+#   2. Do NOT add secrets to this workflow. The pull_request_target context
+#      grants a token with write access to the base repo and access to all
+#      repo secrets if any are referenced. We reference none and request
+#      minimal permissions; keep it that way.
+#   3. Keep the composite action pinned to a full-length commit SHA. Tag or
+#      branch refs would let an upstream change alter the lint logic at
+#      execution time.
 on:
   pull_request_target:
-    types: [opened, synchronize, reopened]
 
-# No permissions at workflow level — grant only what's needed at job level
 permissions: {}
 
 jobs:
-  hidden-unicode-check:
-    name: Check for hidden Unicode characters
+  hidden-unicode:
+    name: hidden unicode characters
     runs-on: ubuntu-latest
     permissions:
-      contents: read
-      pull-requests: read
+      pull-requests: read # required by the composite's `gh pr diff` call
     steps:
-      - name: Checkout base branch
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+      - uses: weaviate/weaviate/.github/actions/security-lint@3e52fc80a244f4644d4facc6a4e705ea6eda9039 # PR #11093
         with:
-          ref: ${{ github.event.pull_request.base.sha }}
-
-      - name: Check PR diff for hidden Unicode
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          gh pr diff ${{ github.event.pull_request.number }} | bash tools/linter_hidden_unicode.sh --stdin
+          pr-number: ${{ github.event.pull_request.number }}
+          github-token: ${{ github.token }}
diff --git a/tools/linter_hidden_unicode.sh b/tools/linter_hidden_unicode.sh
deleted file mode 100755
index c63a6edb..00000000
--- a/tools/linter_hidden_unicode.sh
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env bash
-# Lint for hidden/invisible Unicode characters in diffs (trojan-source attack prevention).
-# Requires Perl (pre-installed on GitHub Actions Ubuntu runners).
-#
-# Usage:
-#   bash tools/linter_hidden_unicode.sh --stdin # read diff from stdin (CI mode)
-#   bash tools/linter_hidden_unicode.sh # diff against a base ref
-#   bash tools/linter_hidden_unicode.sh # diff staged changes (git diff --cached)
-
-set -euo pipefail
-
-# Binary file extensions to skip
-BINARY_PATTERN='\.(png|jpg|jpeg|gif|ico|svg|woff2?|ttf|eot|otf|zip|tar|gz|bz2|xz|7z|rar|pdf|dll|exe|so|dylib|o|obj|class|jar|war|pyc|pyo|wasm|bin|dat|db|sqlite|nupkg|snupkg)$'
-
-get_diff() {
-  if [[ "${1:-}" == "--stdin" ]]; then
-    cat
-  elif [[ -n "${1:-}" ]]; then
-    # Validate ref argument to prevent command injection
-    if ! [[ "$1" =~ ^[a-zA-Z0-9._/-]+$ ]]; then
-      echo "ERROR: Invalid ref argument: $1" >&2
-      exit 1
-    fi
-    if ! git rev-parse --verify "$1" >/dev/null 2>&1; then
-      echo "ERROR: Git ref not found: $1" >&2
-      exit 2
-    fi
-    git diff "$1"
-  else
-    git diff --cached
-  fi
-}
-
-# Perl script that:
-# 1. Tracks current file from diff headers
-# 2. Skips binary files
-# 3. Scans only added lines (starting with +, excluding +++ headers)
-# 4. Detects ~30+ categories of invisible/suspicious Unicode characters
-PERL_SCRIPT='
-use utf8;
-use strict;
-use warnings;
-
-sub escape_property {
-    my ($s) = @_;
-    $s =~ s/%/%25/g;
-    $s =~ s/\r/%0D/g;
-    $s =~ s/\n/%0A/g;
-    $s =~ s/:/%3A/g;
-    $s =~ s/,/%2C/g;
-    return $s;
-}
-
-sub escape_message {
-    my ($s) = @_;
-    $s =~ s/%/%25/g;
-    $s =~ s/\r/%0D/g;
-    $s =~ s/\n/%0A/g;
-    return $s;
-}
-
-my $file = "";
-my $line_in_file = 0;
-my $errors = 0;
-my $in_binary = 0;
-my $binary_pattern = qr/'"$BINARY_PATTERN"'/i;
-
-while () {
-    chomp;
-
-    # Track file from diff headers
-    if (/^\+\+\+ b\/(.+)$/) {
-        $file = $1;
-        $line_in_file = 0;
-        $in_binary = ($file =~ $binary_pattern) ? 1 : 0;
-        next;
-    }
-
-    # Skip binary file markers
-    if (/^Binary files/) {
-        $in_binary = 1;
-        next;
-    }
-
-    # Track hunk headers for line numbers
-    if (/^@@ -\d+(?:,\d+)? \+(\d+)/) {
-        $line_in_file = $1 - 1;
-        next;
-    }
-
-    # Count lines in the new file
-    if (/^\+/ || /^ /) {
-        $line_in_file++;
-    }
-
-    # Only scan added lines, skip binary files
-    next if $in_binary;
-    next unless /^\+/;
-    next if /^\+\+\+ (?:$|b\/|\/dev\/null)/;
-
-    # Remove the leading + for scanning
-    my $content = substr($_, 1);
-
-    # Check for suspicious invisible Unicode characters:
-    # - Bidi overrides and isolates (U+200E-200F, U+202A-202E, U+2066-2069)
-    # - Zero-width characters (U+200B-200D, U+2060)
-    # - Byte order mark mid-line (U+FEFF)
-    # - Soft hyphen (U+00AD)
-    # - Mongolian vowel separator (U+180E)
-    # - Combining grapheme joiner (U+034F)
-    # - Function application and invisible operators (U+2061-2064)
-    # - Hangul fillers (U+115F, U+1160, U+3164, U+FFA0)
-    # - Interlinear annotation (U+FFF9-FFFB)
-    # - Object replacement / replacement char (U+FFFC-FFFD) -- FFFD is sometimes legitimate
-    # - Unicode tag block (U+E0001, U+E0020-E007F)
-    # - Deprecated format chars (U+206A-206F)
-    if ($content =~ /([\x{00AD}\x{034F}\x{115F}\x{1160}\x{180E}\x{200B}-\x{200F}\x{202A}-\x{202E}\x{2060}-\x{2064}\x{2066}-\x{2069}\x{206A}-\x{206F}\x{3164}\x{FE00}-\x{FE0F}\x{FEFF}\x{FFA0}\x{FFF9}-\x{FFFB}\x{E0001}\x{E0020}-\x{E007F}])/) {
-        my $char = $1;
-        my $codepoint = sprintf("U+%04X", ord($char));
-        my $col = $-[1] + 1;
-
-        if ($ENV{GITHUB_ACTIONS}) {
-            my $efile = escape_property($file);
-            my $emsg = escape_message("Hidden Unicode character ${codepoint} found");
-            print "::error file=${efile},line=${line_in_file},col=${col}::${emsg}\n";
-        } else {
-            print "ERROR: $file:$line_in_file:$col - Hidden Unicode character $codepoint found\n";
-        }
-        $errors++;
-    }
-}
-
-if ($errors > 0) {
-    print "\nFound $errors hidden Unicode character(s) in added lines.\n";
-    print "These may indicate a trojan-source attack. See https://trojansource.codes/\n";
-    exit 1;
-} else {
-    print "No hidden Unicode characters detected.\n";
-    exit 0;
-}
-'
-
-get_diff "$@" | perl -CS -e "$PERL_SCRIPT"