Add dangerous Unicode character detection (Trojan Source / glassworm) by andimarek · Pull Request #4344 · graphql-java/graphql-java · GitHub
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion .githooks/pre-commit
54 changes: 53 additions & 1 deletion .github/workflows/validate-files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ name: Validate Files
# so the repo can be cloned on Windows systems.
# 2. File size limits — no files larger than 10 MB. Many enterprise users mirror
# graphql-java into internal repositories that enforce file size limits.
# 3. No dangerous Unicode characters — prevents Trojan Source (BiDi override),
# glassworm, and similar attacks using invisible or control characters.

on:
push:
Expand All @@ -24,7 +26,7 @@ permissions:
jobs:
validate-filenames-and-size:
runs-on: ubuntu-latest
name: Validate Windows Compatibility and File Sizes
name: Validate Files (Windows names, size, Unicode safety)
steps:
- name: Checkout code
uses: actions/checkout@v6
Expand Down Expand Up @@ -96,3 +98,53 @@ jobs:
else
echo "✓ All files are within the 10MB size limit"
fi

- name: Check for dangerous Unicode characters
run: |
echo "Checking for dangerous Unicode characters (Trojan Source / glassworm)..."

# Dangerous character ranges:
# U+0000-0008, U+000B-000C, U+000E-001F C0 control chars (except TAB, LF, CR)
# U+007F-009F DELETE + C1 control chars
# U+200B-200D Zero-width space/non-joiner/joiner
# U+FEFF Zero-width no-break space (BOM)
# U+202A-202E BiDi embedding/override (Trojan Source)
# U+2066-2069 BiDi isolate chars (Trojan Source)

FOUND_FILES=""

while IFS= read -r file; do
if [ ! -f "$file" ]; then
continue
fi
# Skip binary files
if file --mime-type "$file" 2>/dev/null | grep -qv 'text/'; then
continue
fi
MATCHES=$(perl -CSD -ne '
if (/[\x{0000}-\x{0008}\x{000B}\x{000C}\x{000E}-\x{001F}\x{007F}-\x{009F}\x{200B}-\x{200D}\x{FEFF}\x{202A}-\x{202E}\x{2066}-\x{2069}]/) {
print " line $.: $_";
}
' "$file" 2>/dev/null || true)
if [ -n "$MATCHES" ]; then
echo "::error file=${file}::File contains dangerous Unicode characters"
FOUND_FILES="${FOUND_FILES}${file}:\n${MATCHES}\n"
fi
done <<< "$(git ls-files)"

if [ -n "$FOUND_FILES" ]; then
echo ""
echo "The following files contain dangerous Unicode characters:"
echo -e "$FOUND_FILES"
echo ""
echo "These invisible or rendering-altering characters can be used for"
echo "Trojan Source or glassworm-style attacks. Detected categories:"
echo " - C0/C1 control characters (U+0000-001F, U+007F-009F, except TAB/LF/CR)"
echo " - Zero-width characters (U+200B-200D, U+FEFF)"
echo " - BiDi override/isolate (U+202A-202E, U+2066-2069)"
echo ""
echo "Please remove these characters from the affected files."
exit 1
else
echo "✓ No dangerous Unicode characters found"
fi
2 changes: 2 additions & 0 deletions CONTRIBUTING.md