From d8dea9eb960115938c108928a3c58ebae69b6b95 Mon Sep 17 00:00:00 2001
From: Anupam Yadav
Date: Fri, 10 Apr 2026 00:57:05 +0000
Subject: [PATCH] [SPARK-52709][SQL] Fix parsing of STRUCT<>

When the lexer sees STRUCT<>, it tokenizes it as STRUCT + NEQ because
NEQ (<>) is matched before LT (<) in the lexer. This increments
complex_type_level_counter for STRUCT but never decrements it (no GT
token), corrupting the counter for subsequent tokens. As a result,
`SELECT CAST(null AS STRUCT<>), 2 >> 1` fails because >> is not
recognized as shift-right.

The previous fix (#51480) modified the NEQ lexer rule but was reverted
because it broke ARRAY(col1 <> col2) where <> is the not-equal operator.

This fix follows cloud-fan's suggestion to handle it at the parser
level. When the parser matches STRUCT followed by NEQ in the dataType
rule, it decrements the counter via an inline action. This is safe
because the parser has confirmed that NEQ is being used as empty angle
brackets in a type context, not as a comparison operator.

Closes SPARK-52709
---
 .../sql/catalyst/parser/SqlBaseParser.g4      |  2 +-
 .../sql/execution/SparkSqlParserSuite.scala   | 23 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index a905905c098e8..b669b5201cdc1 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -1495,7 +1495,7 @@ primitiveType
 dataType
     : complex=ARRAY (LT dataType GT)?                           #complexDataType
     | complex=MAP (LT dataType COMMA dataType GT)?              #complexDataType
-    | complex=STRUCT ((LT complexColTypeList? GT) | NEQ)?       #complexDataType
+    | complex=STRUCT ((LT complexColTypeList? GT) | NEQ {((SqlBaseLexer) getTokenStream().getTokenSource()).decComplexTypeLevelCounter();})? #complexDataType
     | primitiveType                                             #primitiveDataType
     ;
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
index 94e60db67ac75..3d064c904f19d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
@@ -1164,4 +1164,27 @@ class SparkSqlParserSuite extends AnalysisTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-52709: STRUCT<> should not corrupt complex_type_level_counter") {
+    // STRUCT<> is tokenized as STRUCT + NEQ by the lexer. The parser must decrement
+    // the complex_type_level_counter so that subsequent >> is recognized as shift-right.
+    // Without the fix, this throws a parse error because >> is not recognized.
+    parser.parsePlan("SELECT CAST(null AS STRUCT<>), 2 >> 1")
+
+    // Multiple empty structs should not corrupt the counter
+    parser.parsePlan("SELECT CAST(null AS STRUCT<>), CAST(null AS STRUCT<>), 4 >> 2")
+
+    // Empty struct with unsigned shift right
+    parser.parsePlan("SELECT CAST(null AS STRUCT<>), 8 >>> 2")
+
+    // ARRAY with <> as not-equal operator should still work
+    parser.parsePlan("SELECT ARRAY(1 <> 2)")
+
+    // Nested complex types with >> should still work
+    parser.parsePlan("SELECT CAST(null AS MAP<INT, ARRAY<INT>>)")
+
+    // Mix of empty struct and nested complex types
+    parser.parsePlan(
+      "SELECT CAST(null AS STRUCT<>), CAST(null AS MAP<INT, ARRAY<INT>>), 2 >> 1")
+  }
 }