From 4a063eb4cbf77d2746ae5ea1d74901fb2352cba7 Mon Sep 17 00:00:00 2001
From: dhuck
Date: Wed, 15 Mar 2023 17:58:57 -0500
Subject: [PATCH 1/3] lisp comment parsing

---
 comment_parser/parsers/lisp_parser.py         | 64 +++++++++++++++++++
 .../parsers/tests/lisp_parser_test.py         | 64 +++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 comment_parser/parsers/lisp_parser.py
 create mode 100644 comment_parser/parsers/tests/lisp_parser_test.py

diff --git a/comment_parser/parsers/lisp_parser.py b/comment_parser/parsers/lisp_parser.py
new file mode 100644
--- /dev/null
+++ b/comment_parser/parsers/lisp_parser.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+"""This module provides support for parsing the Lisp family of languages.
+
+Works with:
+  Lisp
+  Scheme
+  Racket
+  Clojure (not including the `(comment ...)` form)
+  ... and other languages which use a leading `;` as the comment form
+"""
+
+import re
+from bisect import bisect_left
+from typing import List
+from comment_parser.parsers import common
+
+# Alternatives are tried in order at each position. String literals and
+# backslash-escaped characters (e.g. the character literals `#\;` and `\"`)
+# are matched only so that a `;` or `"` inside them is consumed instead of
+# being mistaken for the start of a comment or of a string.
+_TOKEN_PATTERN = re.compile(
+    r"""
+    (?P<string> "(?:[^"\\]|\\[\s\S])*") |
+    (?P<escaped> \\[\s\S]) |
+    (?P<single> ;+(?P<single_content>.*)$)
+    """, re.VERBOSE | re.MULTILINE)
+
+
+def extract_comments(code: str) -> List[common.Comment]:
+    """Extracts a list of comments from the given Lisp family source code.
+
+    Comments are represented with the Comment class found in the common module.
+    Lisp family comments come in a single form: a `;` that is neither inside a
+    string literal nor escaped with a backslash starts a comment that runs to
+    the end of the line. Note that various languages in the Lisp family use
+    multiple `;` to denote certain types of comments. For example, a single `;`
+    may mark an inline comment, while two (`;;`) or more may be considered
+    official documentation. This parser does not differentiate between these
+    types: it consumes all of the leading `;` characters and returns the text
+    that follows them.
+
+    Block comments (`#| ... |#`) and form-level comments (`#;`, `#_`,
+    `(comment ...)`) are not given any special treatment.
+
+    Args:
+      code (str): String containing code to extract comments from.
+
+    Returns:
+      List[common.Comment]: list of comments in the order that they appear in
+        the code.
+    """
+    # Offset of the end of every line. Bisecting a match's start offset into
+    # this sorted list gives the zero-based index of the line it starts on.
+    line_ends = [match.start() for match in re.finditer(r"$", code, re.M)]
+
+    comments = []
+    for match in _TOKEN_PATTERN.finditer(code):
+        # Strings and escaped characters are matched only to be skipped.
+        if match.lastgroup != "single":
+            continue
+        line_no = bisect_left(line_ends, match.start())
+        comments.append(
+            common.Comment(match.group("single_content"), line_no + 1))
+    return comments
diff --git a/comment_parser/parsers/tests/lisp_parser_test.py b/comment_parser/parsers/tests/lisp_parser_test.py
new file mode 100644
--- /dev/null
+++ b/comment_parser/parsers/tests/lisp_parser_test.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+"""Tests for comment_parser.parsers.lisp_parser.py"""
+
+import unittest
+from comment_parser.parsers import common
+from comment_parser.parsers import lisp_parser
+
+
+class LispParserTest(unittest.TestCase):
+
+    def testSimpleMain(self):
+        code = "; this is a comment\n(format t \"Hello, World!\")"
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(code[1:19], 1, False)]
+        self.assertEqual(comments, expected)
+
+    def testSingleLineComment(self):
+        code = "; single line comment"
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(code[1:], 1, False)]
+        self.assertEqual(comments, expected)
+
+    def testSingleLineCommentInStringLiteral(self):
+        code = '(format t "; this is not a comment")'
+        comments = lisp_parser.extract_comments(code)
+        self.assertEqual(comments, [])
+
+    def testMultipleCommentCharacters(self):
+        code = ';; this is a comment'
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(code[2:], 1, False)]
+        self.assertEqual(comments, expected)
+
+    def testCommentsAfterLine(self):
+        code = '(t format "Hello World") ; this is a comment'
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(' this is a comment', 1, False)]
+        self.assertEqual(comments, expected)
+
+    def testEscapedQuoteInStringLiteral(self):
+        code = r'(print "a \" ; b")'
+        comments = lisp_parser.extract_comments(code)
+        self.assertEqual(comments, [])
+
+    def testMultiLineStringLiteral(self):
+        code = '(defun f ()\n  "Do a thing; then\nanother thing."\n  1) ; done'
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(' done', 4, False)]
+        self.assertEqual(comments, expected)
+
+    def testCharacterLiterals(self):
+        code = r'(list #\; #\") ; chars'
+        comments = lisp_parser.extract_comments(code)
+        expected = [common.Comment(' chars', 1, False)]
+        self.assertEqual(comments, expected)
+
+    def testLineNumbers(self):
+        code = '; one\n(foo)\n;; two\n'
+        comments = lisp_parser.extract_comments(code)
+        expected = [
+            common.Comment(' one', 1, False),
+            common.Comment(' two', 3, False),
+        ]
+        self.assertEqual(comments, expected)

From e559fd5accbbf9be458e53c5fc2ca458914fd18f Mon Sep 17 00:00:00 2001
From: dhuck
Date: Wed, 15 Mar 2023 20:14:52 -0500
Subject: [PATCH 2/3] added clojure and lisp

---
 comment_parser/comment_parser.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/comment_parser/comment_parser.py b/comment_parser/comment_parser.py
index 25c9b44..05bb41e 100755
--- a/comment_parser/comment_parser.py
+++ b/comment_parser/comment_parser.py
@@ -33,6 +33,7 @@
 from comment_parser.parsers import python_parser
 from comment_parser.parsers import ruby_parser
 from comment_parser.parsers import shell_parser
+from comment_parser.parsers import lisp_parser
 
 MIME_MAP = {
     'application/javascript': js_parser,  # Javascript
@@ -48,6 +49,8 @@
     'text/x-script.python': python_parser,  # Python
     'text/x-shellscript': shell_parser,  # Unix shell
     'text/xml': html_parser,  # XML
+    'text/x-lisp': lisp_parser,  # Lisp
+    'text/x-clojure': lisp_parser,  # Clojure
 }
 
 

From 0a7d2e5fecc4d99d4c13a379351bebf85eb5519d Mon Sep 17 00:00:00 2001
From: dhuck
Date: Wed, 15 Mar 2023 20:31:22 -0500
Subject: [PATCH 3/3] add racket and scheme

---
 comment_parser/comment_parser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comment_parser/comment_parser.py b/comment_parser/comment_parser.py
index 05bb41e..d0e7177 100755
--- a/comment_parser/comment_parser.py
+++ b/comment_parser/comment_parser.py
@@ -51,6 +51,8 @@
     'text/xml': html_parser,  # XML
     'text/x-lisp': lisp_parser,  # Lisp
     'text/x-clojure': lisp_parser,  # Clojure
+    'text/x-racket': lisp_parser,  # Racket
+    'text/x-scheme': lisp_parser,  # Scheme
 }
 
 