From 50e0c8d93fdde6192d155d3751c42c609330520e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 16 Apr 2026 16:23:01 +0000 Subject: [PATCH] Merge two-pass parser into single pass with array-based output and specific error messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace removeComments() + removeTrailingCommas() with a single processInput() that handles comments and trailing commas in one iteration over the input - The trailing-comma lookahead is now comment-aware: it skips // and /* */ regions when scanning forward for } or ], enabling correct handling of patterns like {"a": 1, // comment\n} - Switch string accumulation from $result .= $char to $result[] = $char with a final implode(), avoiding O(n²) realloc behavior on large inputs - Distinguish unclosed string vs unclosed block comment in error sentinels: "unclosed string literal" vs "unclosed block comment" - Remove isWhitespace() helper (inlined into processInput() lookahead) All 100 existing tests pass. https://claude.ai/code/session_01KABhJ1aojwpPmJ3fGhVxsZ --- src/JSONC.php | 174 +++++++++++++++++++------------------------------- 1 file changed, 65 insertions(+), 109 deletions(-) diff --git a/src/JSONC.php b/src/JSONC.php index f2cbbc6..fabb024 100644 --- a/src/JSONC.php +++ b/src/JSONC.php @@ -36,13 +36,7 @@ public static function parse(string $jsonc): string $jsonc = substr($jsonc, 3); } - // Pass 1: Remove comments - $json = self::removeComments($jsonc); - - // Pass 2: Remove trailing commas - $json = self::removeTrailingCommas($json); - - return $json; + return self::processInput($jsonc); } /** @@ -71,20 +65,20 @@ public static function decode( } /** - * Removes comments from JSONC string while preserving strings + * Processes JSONC string in a single pass: removes comments and trailing commas * - * Uses a state machine to track context and avoid removing - * comment-like syntax inside string values. + * Uses a state machine to track context, skipping comment content and + * dropping trailing commas via a comment-aware lookahead. * - * @param string $input JSONC string with comments - * @return string JSON string without comments + * @param string $input JSONC string after null-byte and BOM removal + * @return string Clean JSON string, or an error sentinel on unclosed constructs */ - private static function removeComments(string $input): string + private static function processInput(string $input): string { - $state = ParserState::Normal; - $result = ''; + $result = []; + $state = ParserState::Normal; $length = strlen($input); - $i = 0; + $i = 0; while ($i < $length) { $char = $input[$i]; @@ -94,20 +88,63 @@ private static function removeComments(string $input): string case ParserState::Normal: if ($char === '"') { $state = ParserState::InString; - $result .= $char; + $result[] = $char; } elseif ($char === '/' && $next === '/') { $state = ParserState::SingleLineComment; $i++; // Skip second '/' } elseif ($char === '/' && $next === '*') { $state = ParserState::MultiLineComment; $i++; // Skip '*' + } elseif ($char === ',') { + // Comment-aware lookahead to detect trailing commas + $j = $i + 1; + $skipped = []; + + while ($j < $length) { + $c = $input[$j]; + $n = ($j + 1 < $length) ? $input[$j + 1] : null; + + if ($c === ' ' || $c === "\t" || $c === "\n" || $c === "\r") { + $skipped[] = $c; + $j++; + } elseif ($c === '/' && $n === '/') { + // Skip single-line comment body; newline is picked up as whitespace + $j += 2; + while ($j < $length && $input[$j] !== "\n" && $input[$j] !== "\r") { + $j++; + } + } elseif ($c === '/' && $n === '*') { + // Skip block comment + $j += 2; + while ($j < $length) { + if ($input[$j] === '*' && ($j + 1 < $length) && $input[$j + 1] === '/') { + $j += 2; + break; + } + $j++; + } + } else { + break; + } + } + + if ($j < $length && ($input[$j] === '}' || $input[$j] === ']')) { + // Trailing comma: drop it, emit accumulated whitespace, jump to closing bracket + foreach ($skipped as $ws) { + $result[] = $ws; + } + $i = $j - 1; // Main loop $i++ lands on closing bracket + } else { + // Not a trailing comma, keep it + $result[] = $char; + } } else { - $result .= $char; + $result[] = $char; } break; case ParserState::InString: - $result .= $char; + $result[] = $char; if ($char === '\\') { $state = ParserState::InStringEscape; } elseif ($char === '"') { @@ -116,13 +153,13 @@ private static function removeComments(string $input): string break; case ParserState::InStringEscape: - $result .= $char; + $result[] = $char; $state = ParserState::InString; break; case ParserState::SingleLineComment: if ($char === "\n" || $char === "\r") { - $result .= $char; // Preserve line breaks + $result[] = $char; // Preserve line breaks $state = ParserState::Normal; } // Otherwise skip character (it's part of the comment) @@ -140,94 +177,13 @@ private static function removeComments(string $input): string $i++; } - // Validate that we ended in a valid state - // This catches unclosed strings, unclosed escape sequences and unclosed comments - // Note: This validation protects both removeComments() and removeTrailingCommas() - if ($state !== ParserState::Normal && $state !== ParserState::SingleLineComment) { - // Return invalid JSON that will fail in json_decode() - return '{JSONC_PARSE_ERROR: unclosed string or comment}'; - } - - return $result; - } - - /** - * Removes trailing commas from JSON string while preserving strings - * - * Uses a state machine to track context and only remove commas - * that appear before closing brackets/braces. - * - * @param string $input JSON string with potential trailing commas - * @return string JSON string without trailing commas - */ - private static function removeTrailingCommas(string $input): string - { - $state = ParserState::Normal; - $result = ''; - $length = strlen($input); - $i = 0; - - while ($i < $length) { - $char = $input[$i]; - - switch ($state) { - case ParserState::Normal: - if ($char === '"') { - $state = ParserState::InString; - $result .= $char; - } elseif ($char === ',') { - // Look ahead to find next non-whitespace character - $j = $i + 1; - $whitespace = ''; - - while ($j < $length && self::isWhitespace($input[$j])) { - $whitespace .= $input[$j]; - $j++; - } - - // Check if comma is trailing (before } or ]) - if ($j < $length && ($input[$j] === '}' || $input[$j] === ']')) { - // Skip comma but preserve whitespace - $result .= $whitespace; - $i = $j - 1; // Will be incremented at end of loop - } else { - // Not a trailing comma, keep it - $result .= $char; - } - } else { - $result .= $char; - } - break; - - case ParserState::InString: - $result .= $char; - if ($char === '\\') { - $state = ParserState::InStringEscape; - } elseif ($char === '"') { - $state = ParserState::Normal; - } - break; - - case ParserState::InStringEscape: - $result .= $char; - $state = ParserState::InString; - break; - } - - $i++; - } - - return $result; - } + // Validate final state and return specific error sentinels for unclosed constructs + $error = match ($state) { + ParserState::Normal, ParserState::SingleLineComment => null, + ParserState::MultiLineComment => '{JSONC_PARSE_ERROR: unclosed block comment}', + ParserState::InString, ParserState::InStringEscape => '{JSONC_PARSE_ERROR: unclosed string literal}', + }; - /** - * Checks if a character is whitespace - * - * @param string $char Single character to check - * @return bool True if whitespace - */ - private static function isWhitespace(string $char): bool - { - return in_array($char, [' ', "\t", "\n", "\r"], true); + return $error ?? implode('', $result); } }