graphframes · SemyonSinchenko · Jun 24, 2026 · Jun 24, 2026
diff --git a/build.sbt b/build.sbt
@@ -11,6 +11,11 @@ lazy val scalaVersions = sparkMajorVer match {
   case "3" => Seq("2.12.21", "2.13.18")
   case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
 }
+// ANTLR release to pair with each supported Spark major line (ANTLR reaches the classpath
+// transitively through Spark, so the two must agree).
+//
+// Derived from sparkMajorVer, exactly like scalaVersions, so that the Spark version is resolved in
+// one place only. Re-reading the "spark.version" system property here with a separately
+// hard-coded default would let the ANTLR version drift from the Spark version actually built
+// against whenever that default changes.
+lazy val antlr4ToolVersion = sparkMajorVer match {
+  case "4" => "4.13.1"
+  case "3" => "4.9.3"
+  case v => throw new IllegalArgumentException(s"Unsupported Spark version major: $v")
+}
 lazy val scalaVer = sys.props.getOrElse("scala.version", scalaVersions.head)
 lazy val defaultScalaTestVer = "3.2.19"
 lazy val jmhVersion = "1.37"
@@ -75,14 +80,20 @@ lazy val commonSetting = Seq(
     "org.apache.spark" %% "spark-sql" % sparkVer % "provided" cross CrossVersion.for3Use2_13,
     "org.apache.spark" %% "spark-mllib" % sparkVer % "provided" cross CrossVersion.for3Use2_13,
     "org.slf4j" % "slf4j-api" % "2.0.17" % "provided",
-    "org.apache.datasketches" % "datasketches-java" % "6.2.0", // transitive dependency from Spark
+    "org.apache.datasketches" % "datasketches-java" % "6.2.0" % "provided", // transitive from Spark
+    "org.antlr" % "antlr4" % antlr4ToolVersion % "provided", // transitive from Spark
     "org.scalatest" %% "scalatest" % defaultScalaTestVer % Test,
     "com.github.zafarkhaja" % "java-semver" % "0.10.2" % Test),
   Compile / doc / scalacOptions ++= Seq(
     "-groups",
     "-implicits",
     "-skip-packages",
-    Seq("org.apache.spark").mkString(":")),
+    // org.apache.spark is skipped to avoid rendering transitive Spark types; the GQL query engine
+    // under org.graphframes.propertygraph.internal is entirely private[propertygraph] (and
+    // AstBuilder references generated ANTLR Java types), so it is skipped from rendered output too.
+    // The internal package is still type-checked so that the public PropertyGraphFrame, which calls
+    // into it, compiles in doc.
+    Seq("org.apache.spark", "org.graphframes.propertygraph.internal").mkString(":")),
   Test / doc / scalacOptions ++= Seq("-groups", "-implicits"),
 
   // Test settings
@@ -158,13 +169,18 @@ lazy val graphx = (project in file("graphx"))
 
 lazy val core = (project in file("core"))
   .dependsOn(graphx)
+  .enablePlugins(GraphFramesAntlr4Plugin)
   .settings(
     commonSetting,
     name := "graphframes",
     moduleName := s"${name.value}-spark$sparkMajorVer",
     // Export the JAR so that this can be excluded from shading in connect
     exportJars := true,
 
+    // Emit the generated GQL parser/lexer into the internal package so the
+    // (forthcoming) AstBuilder can import them.
+    antlr4GenPackage := Some("org.graphframes.propertygraph.internal"),
+
     // Global settings
     Global / concurrentRestrictions := Seq(Tags.limitAll(1)),
     autoAPIMappings := true,

diff --git a/core/src/main/antlr4/org/graphframes/propertygraph/internal/GqlLexer.g4 b/core/src/main/antlr4/org/graphframes/propertygraph/internal/GqlLexer.g4
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+lexer grammar GqlLexer;
+
+// ===========================================================================
+// Reserved words
+//
+// Spelled with the one-letter fragments declared at the end of this grammar,
+// so any mix of upper and lower case is accepted. All keyword rules come
+// before IDENTIFIER: when a keyword and IDENTIFIER match exactly the same
+// text, ANTLR picks the rule that is declared first.
+//
+// NOTE(review): letter fragments are presumably used instead of the
+// grammar-level `caseInsensitive` option because that option only exists in
+// ANTLR 4.10+, while the Spark 3 build uses an older ANTLR -- confirm.
+// ===========================================================================
+MATCH  : M A T C H ;
+WHERE  : W H E R E ;
+RETURN : R E T U R N ;
+AND    : A N D ;
+OR     : O R ;
+NOT    : N O T ;
+AS     : A S ;
+TRUE   : T R U E ;
+FALSE  : F A L S E ;
+NULL   : N U L L ;
+IS     : I S ;
+IN     : I N ;
+
+// ===========================================================================
+// Operators and punctuation
+//
+// The lexer always takes the longest possible match, so the two-character
+// tokens ('->', '<-', '<=', '>=', '<>', '!=', '..') are produced as single
+// tokens even though most of them begin with a character that is a token in
+// its own right. Declaration order only breaks ties between rules matching
+// the same text; no two rules in this section do, so the order used here is
+// purely for readability.
+// ===========================================================================
+ARROW_RIGHT : '->' ;
+ARROW_LEFT  : '<-' ;
+LTE         : '<=' ;
+GTE         : '>=' ;
+NEQ         : '<>' ;
+NEQ_BANG    : '!=' ;
+LT          : '<' ;
+GT          : '>' ;
+EQ          : '=' ;
+DASH        : '-' ;
+PLUS        : '+' ;
+STAR        : '*' ;
+SLASH       : '/' ;
+PERCENT     : '%' ;
+DOT         : '.' ;
+DOTDOT      : '..' ;
+COMMA       : ',' ;
+COLON       : ':' ;
+LPAREN      : '(' ;
+RPAREN      : ')' ;
+LBRACK      : '[' ;
+RBRACK      : ']' ;
+LBRACE      : '{' ;
+RBRACE      : '}' ;
+
+// ===========================================================================
+// Literal values
+// ===========================================================================
+
+// Text in single quotes. A quote inside the text is written as two quotes
+// ('it''s'), following the SQL/GQL convention; there are no backslash escapes.
+STRING_LITERAL  : '\'' ( ~'\'' | '\'\'' )* '\'' ;
+
+// Numbers are unsigned. A decimal needs digits on both sides of the point, so
+// in "1..3" the "1" is lexed as an INTEGER_LITERAL followed by DOTDOT.
+DECIMAL_LITERAL : DIGIT+ '.' DIGIT+ ;
+INTEGER_LITERAL : DIGIT+ ;
+
+// ===========================================================================
+// Names. Declared after every keyword so that reserved words are never
+// reported as identifiers.
+// ===========================================================================
+IDENTIFIER : [a-zA-Z_] [a-zA-Z0-9_]* ;
+
+// ===========================================================================
+// Ignored input: whitespace, '//' comments running to the end of the line,
+// and '/* ... */' comments (non-greedy, so they end at the first '*/').
+// ===========================================================================
+WS            : [ \t\r\n\u000C]+ -> skip ;
+LINE_COMMENT  : '//' ~[\r\n]*    -> skip ;
+BLOCK_COMMENT : '/*' .*? '*/'    -> skip ;
+
+// ===========================================================================
+// Helper fragments (never emitted as tokens)
+// ===========================================================================
+fragment DIGIT : [0-9] ;
+
+// One fragment per letter, each matching the lower- and upper-case form.
+// Keyword rules are spelled out of these to become case-insensitive.
+fragment A : [aA] ;
+fragment B : [bB] ;
+fragment C : [cC] ;
+fragment D : [dD] ;
+fragment E : [eE] ;
+fragment F : [fF] ;
+fragment G : [gG] ;
+fragment H : [hH] ;
+fragment I : [iI] ;
+fragment J : [jJ] ;
+fragment K : [kK] ;
+fragment L : [lL] ;
+fragment M : [mM] ;
+fragment N : [nN] ;
+fragment O : [oO] ;
+fragment P : [pP] ;
+fragment Q : [qQ] ;
+fragment R : [rR] ;
+fragment S : [sS] ;
+fragment T : [tT] ;
+fragment U : [uU] ;
+fragment V : [vV] ;
+fragment W : [wW] ;
+fragment X : [xX] ;
+fragment Y : [yY] ;
+fragment Z : [zZ] ;
diff --git a/core/src/main/antlr4/org/graphframes/propertygraph/internal/GqlParser.g4 b/core/src/main/antlr4/org/graphframes/propertygraph/internal/GqlParser.g4
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parser grammar GqlParser;
+
+options {
+    tokenVocab = GqlLexer;
+}
+
+// ---------------------------------------------------------------------------
+// Top-level statement.
+//
+// Shape: MATCH <pattern> [WHERE <expression>] [RETURN <items>]. Exactly one
+// MATCH clause is accepted; WHERE and RETURN are each optional. The trailing
+// EOF forces the whole input to be consumed, so anything left over after a
+// valid statement is reported as a syntax error instead of being ignored.
+//
+// RETURN is optional in the grammar so the engine can default to returning
+// matched IDs when the user omits it (forward-compatible; trivially tightened
+// later by making RETURN mandatory).
+// ---------------------------------------------------------------------------
+gqlStatement
+    : MATCH matchPattern (WHERE whereClause)? (RETURN returnClause)? EOF
+    ;
+
+// A match pattern is a chain of alternating nodes and edges that starts and
+// ends with a node, e.g. (a:Person)-[:KNOWS]->(b:Person)-[:WORKS_AT]->(c:Company).
+// A single node with no edges, e.g. (a:Person), is also a valid pattern.
+// Each edge may point right, point left, or be undirected (see edgePattern).
+matchPattern
+    : nodePattern (edgePattern nodePattern)*
+    ;
+
+// Node pattern: typed (a:Person), untyped (x), label-only (:Person), or
+// anonymous (). The variable and the label are independently optional, and
+// at most one label can be given.
+nodePattern
+    : LPAREN (variable=IDENTIFIER)? (COLON label=IDENTIFIER)? RPAREN
+    ;
+
+// Edge pattern.
+//
+// Three forms:
+//   -[e:KNOWS]->     (left-to-right)
+//   <-[e:KNOWS]-     (right-to-left)
+//   -[:KNOWS]-       (undirected)
+// The edge body [variable? :label?] is shared via edgeBody.
+//
+// The arrow heads '->' and '<-' arrive from GqlLexer as single tokens
+// (ARROW_RIGHT / ARROW_LEFT). The bracketed body is mandatory, so the
+// abbreviated forms '-->', '<--' and '--' are not accepted, and neither is a
+// double-headed edge '<-[e]->'.
+edgePattern
+    : DASH    edgeBody ARROW_RIGHT   // a -[e]-> b
+    | ARROW_LEFT edgeBody DASH       // a <-[e]- b
+    | DASH    edgeBody DASH          // a -[e]- b
+    ;
+
+// Bracketed part of an edge. The variable, the label and the quantifier are
+// all optional, so the empty body [] is valid. A label must be introduced by
+// a colon: in [KNOWS] the name is parsed as the variable, not as a label.
+edgeBody
+    : LBRACK (variable=IDENTIFIER)? (COLON label=IDENTIFIER)? quantifier? RBRACK
+    ;
+
+// Variable-length pattern: [:KNOWS*1..3] or [:KNOWS*3]
+//
+// A number is required after the star: a bare '*' and open-ended ranges such
+// as '*2..' are not accepted. The grammar does not relate the two bounds to
+// each other (e.g. '*3..1' parses); any such validation has to happen after
+// parsing.
+quantifier
+    : STAR lo=INTEGER_LITERAL DOTDOT hi=INTEGER_LITERAL   // *1..3  (bounded range)
+    | STAR exact=INTEGER_LITERAL                          // *3     (exactly 3 hops)
+    ;
+
+// ---------------------------------------------------------------------------
+// WHERE clause: a single boolean expression.
+//
+// The grammar itself accepts any expression here (e.g. an arithmetic one);
+// it does not check that the result is boolean.
+// ---------------------------------------------------------------------------
+whereClause
+    : expression
+    ;
+
+// ---------------------------------------------------------------------------
+// RETURN clause: either a bare '*' or a non-empty, comma-separated list of
+// items. The two forms cannot be mixed ('RETURN *, a.name' does not parse).
+// ---------------------------------------------------------------------------
+returnClause
+    : STAR
+    | returnItem (COMMA returnItem)*
+    ;
+
+// One returned expression, optionally renamed with AS <alias>. The AS keyword
+// is required whenever an alias is given.
+returnItem
+    : expression (AS alias=IDENTIFIER)?
+    ;
+
+// ---------------------------------------------------------------------------
+// Expression grammar.
+//
+// Precedence (lowest -> highest): OR < AND < NOT < comparison < additive <
+// multiplicative < primary.
+//
+// Precedence is encoded by layering: each level is its own rule and refers
+// only to the next-tighter level. No rule here is left-recursive; a run of
+// same-level binary operators is written as a repetition,
+// `operand (op operand)*`, so the parse tree holds a flat list of operands
+// and operators per level and associativity is decided by whoever consumes
+// the tree.
+//
+// NOTE(review): there is no unary minus or plus, so a negative number such as
+// -1 is not a valid expression on its own (write 0 - 1) -- confirm this is
+// intended.
+// ---------------------------------------------------------------------------
+expression
+    : orExpr
+    ;
+
+// One or more AND-level operands separated by OR.
+orExpr
+    : andExpr (OR andExpr)*
+    ;
+
+// One or more NOT-level operands separated by AND.
+andExpr
+    : notExpr (AND notExpr)*
+    ;
+
+// Any number of leading NOTs (right-recursive, so NOT NOT x parses) applied
+// to a comparison.
+notExpr
+    : NOT notExpr
+    | comparison
+    ;
+
+// At most one comparison operator per level: a < b < c does not parse.
+comparison
+    : additive (compOp additive)?
+    ;
+
+// Binary + and - over multiplicative operands.
+additive
+    : multiplicative ((PLUS | DASH) multiplicative)*
+    ;
+
+// Binary *, / and % over primary operands.
+multiplicative
+    : primary ((STAR | SLASH | PERCENT) primary)*
+    ;
+
+// Atoms of the expression grammar: a parenthesised expression, a literal, a
+// function call, a property access, or a bare variable reference.
+//
+// The last three alternatives all begin with IDENTIFIER; they are told apart
+// by the token that follows it: LPAREN selects functionCall, DOT selects
+// propertyAccess, anything else leaves a plain variable.
+primary
+    : LPAREN expression RPAREN
+    | literal
+    | functionCall
+    | propertyAccess
+    | variable=IDENTIFIER
+    ;
+
+// name(arg, ...) with zero or more comma-separated argument expressions.
+// The function name is a plain IDENTIFIER; the grammar does not restrict
+// which names are allowed.
+functionCall
+    : name=IDENTIFIER LPAREN ( expression ( COMMA expression )* )? RPAREN
+    ;
+
+// variable.property -- exactly one level; a.b.c does not parse. Both parts
+// are IDENTIFIER tokens, so a reserved word cannot be used as either one.
+propertyAccess
+    : variable=IDENTIFIER DOT property=IDENTIFIER
+    ;
+
+// Comparison operators. Two spellings of "not equal" are accepted: NEQ
+// ('<>') and NEQ_BANG ('!='); they are distinct tokens, so a tree consumer
+// has to handle both.
+compOp
+    : EQ
+    | NEQ
+    | NEQ_BANG
+    | LT
+    | LTE
+    | GT
+    | GTE
+    ;
+
+// Literal values. Numeric literals are unsigned (GqlLexer builds them from
+// digits only), strings are single-quoted, and TRUE / FALSE / NULL are the
+// case-insensitive keywords defined in GqlLexer.
+literal
+    : INTEGER_LITERAL
+    | DECIMAL_LITERAL
+    | STRING_LITERAL
+    | TRUE
+    | FALSE
+    | NULL
+    ;