Added support for character class union in regex processing · GeekMasher/codeql@fe6de2f · GitHub
Skip to content

Commit fe6de2f

Browse files
committed
Added support for character class union in regex processing
1 parent 1e05f32 commit fe6de2f

6 files changed

Lines changed: 325 additions & 257 deletions

File tree

Lines changed: 22 additions & 0 deletions

javascript/extractor/src/com/semmle/js/ast/regexp/Visitor.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,6 @@ public interface Visitor {
6767
public void visit(CharacterClassIntersection nd);
6868

6969
public void visit(CharacterClassSubtraction nd);
70+
71+
public void visit(CharacterClassUnion nd);
7072
}

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1414
import com.semmle.js.ast.regexp.CharacterClassRange;
1515
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
16+
import com.semmle.js.ast.regexp.CharacterClassUnion;
1617
import com.semmle.js.ast.regexp.Constant;
1718
import com.semmle.js.ast.regexp.ControlEscape;
1819
import com.semmle.js.ast.regexp.ControlLetter;
@@ -98,6 +99,7 @@ public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
9899
termkinds.put("CharacterClassQuotedString", 28);
99100
termkinds.put("CharacterClassIntersection", 29);
100101
termkinds.put("CharacterClassSubtraction", 30);
102+
termkinds.put("CharacterClassUnion", 31);
101103
}
102104

103105
private static final String[] errmsgs =
@@ -372,6 +374,14 @@ public void visit(CharacterClassSubtraction nd) {
372374
for (RegExpTerm element : nd.getSubtraction())
373375
visit(element, lbl, i++);
374376
}
377+
378+
@Override
379+
public void visit(CharacterClassUnion nd) {
380+
Label lbl = extractTerm(nd, parent, idx);
381+
int i = 0;
382+
for (RegExpTerm element : nd.getUnion())
383+
visit(element, lbl, i++);
384+
}
375385
}
376386

377387
public void extract(String src, SourceMap sourceMap, Node parent, boolean isSpeculativeParsing, String flags) {

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import com.semmle.js.ast.regexp.CharacterClassQuotedString;
1010
import com.semmle.js.ast.regexp.CharacterClassRange;
1111
import com.semmle.js.ast.regexp.CharacterClassSubtraction;
12+
import com.semmle.js.ast.regexp.CharacterClassUnion;
1213
import com.semmle.js.ast.regexp.Constant;
1314
import com.semmle.js.ast.regexp.ControlEscape;
1415
import com.semmle.js.ast.regexp.ControlLetter;
@@ -568,6 +569,7 @@ private enum CharacterClassType {
568569
STANDARD,
569570
INTERSECTION,
570571
SUBTRACTION,
572+
UNION
571573
}
572574

573575
// ECMA 2024 `v` flag allows nested character classes.
@@ -599,12 +601,26 @@ else if (lookahead("--")) {
599601
}
600602
}
601603

604+
boolean containsComplex = elements.stream().anyMatch(term -> term instanceof UnicodePropertyEscape ||
605+
term instanceof CharacterClassQuotedString ||
606+
term instanceof CharacterClass);
607+
608+
// Set type to UNION only if:
609+
// 1. We haven't already determined a specific type (intersection/subtraction)
610+
// 2. We have more than one element
611+
// 3. We have at least one complex element (i.e. a nested CharacterClass, a CharacterClassQuotedString, or a UnicodePropertyEscape)
612+
if (containsComplex && classType == CharacterClassType.STANDARD && elements.size() > 1) {
613+
classType = CharacterClassType.UNION;
614+
}
615+
602616
// Create appropriate RegExpTerm based on the detected class type
603617
switch (classType) {
604618
case INTERSECTION:
605619
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassIntersection(loc, elements)), inverted));
606620
case SUBTRACTION:
607621
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassSubtraction(loc, elements)), inverted));
622+
case UNION:
623+
return this.finishTerm(new CharacterClass(loc, Collections.singletonList(new CharacterClassUnion(loc, elements)), inverted));
608624
case STANDARD:
609625
default:
610626
return this.finishTerm(new CharacterClass(loc, elements, inverted));

javascript/extractor/tests/es2024/output/trap/regex_nested_character_class.js.trap

Lines changed: 57 additions & 51 deletions

0 commit comments

Comments
 (0)