diff --git a/comment_parser/parsers/c_parser.py b/comment_parser/parsers/c_parser.py
index c7f51ec..fc2fa02 100644
--- a/comment_parser/parsers/c_parser.py
+++ b/comment_parser/parsers/c_parser.py
@@ -60,8 +60,12 @@ def extract_comments(code: str) -> List[common.Comment]:
       comment = common.Comment(comment_content, line_no + 1)
       comments.append(comment)
     elif kind == "multi":
+      end_character = match.end()
+      line_no_end = bisect_left(lines_indexes, end_character)
+      # 1-based numbers of every line the comment spans, first to last.
+      line_numbers = list(range(line_no + 1, line_no_end + 2))
       comment_content = match.group("multi_content")
-      comment = common.Comment(comment_content, line_no + 1, multiline=True)
+      comment = common.Comment(comment_content, line_numbers, multiline=True)
       comments.append(comment)
     elif kind == "error":
       raise common.UnterminatedCommentError()
diff --git a/comment_parser/parsers/common.py b/comment_parser/parsers/common.py
index 15f72c7..6598d1f 100644
--- a/comment_parser/parsers/common.py
+++ b/comment_parser/parsers/common.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python
 """This module provides constructs common to all comment parsers."""
+
+from typing import List, Union
 
 
 class Error(Exception):
@@ -17,12 +19,16 @@ class UnterminatedCommentError(Error):
 class Comment():
   """Represents comments found in source files."""
 
-  def __init__(self, text: str, line_number: int, multiline: bool = False):
+  def __init__(self,
+               text: str,
+               line_number: Union[int, List[int]],
+               multiline: bool = False):
     """Initializes Comment.
 
     Args:
       text: String text of comment.
-      line_number: Line number (int) comment was found on.
+      line_number: Line number (int) comment was found on, or list of the line
+        numbers (List[int]) a multiline comment spans.
       multiline: Boolean whether this comment was a multiline comment.
     """
     self._text = text
@@ -37,11 +43,12 @@ def text(self) -> str:
     """
     return self._text
 
-  def line_number(self) -> int:
+  def line_number(self) -> Union[int, List[int]]:
     """Returns the line number the comment was found on.
 
     Returns:
-      Int
+      Int, or list of ints when the parser recorded every line that a
+      multiline comment spans.
     """
     return self._line_number
 
diff --git a/comment_parser/parsers/html_parser.py b/comment_parser/parsers/html_parser.py
index 6bb58f0..13afbc5 100644
--- a/comment_parser/parsers/html_parser.py
+++ b/comment_parser/parsers/html_parser.py
@@ -51,8 +51,12 @@ def extract_comments(code: str) -> List[common.Comment]:
       comment = common.Comment(comment_content, line_no + 1)
       comments.append(comment)
     elif kind == "multi":
+      end_character = match.end()
+      line_no_end = bisect_left(lines_indexes, end_character)
+      # 1-based numbers of every line the comment spans, first to last.
+      line_numbers = list(range(line_no + 1, line_no_end + 2))
       comment_content = match.group("multi_content")
-      comment = common.Comment(comment_content, line_no + 1, multiline=True)
+      comment = common.Comment(comment_content, line_numbers, multiline=True)
       comments.append(comment)
     elif kind == "error":
       raise common.UnterminatedCommentError()
diff --git a/comment_parser/parsers/python_parser.py b/comment_parser/parsers/python_parser.py
index f354d9a..9ebc914 100644
--- a/comment_parser/parsers/python_parser.py
+++ b/comment_parser/parsers/python_parser.py
@@ -3,15 +3,37 @@
 
 import io
 import tokenize
 from typing import List
 from comment_parser.parsers import common
 
 
 def extract_comments(code: str) -> List[common.Comment]:
   """Extracts a list of comments from the given Python script.
 
-  Comments are identified using the tokenize module. Does not include function,
-  class, or module docstrings. All comments are single line comments.
+  Comments are identified using the tokenize module.
+  - Single-line comments begin with the '#' character and end with a
+    line-break.
+  - Multi-line comments or docstrings are triple-quoted strings (delimited by
+    three single quotes or three double quotes). They are told apart from
+    regular strings by the type of the previous significant token, which must
+    be a statement end or an indentation change (NEWLINE, INDENT or DEDENT), or
+    the start of the script. NL and COMMENT tokens are skipped when looking
+    back: blank lines and '#' comments may precede the string, while a line
+    break inside brackets does not turn a string argument into a comment.
+    Even in cases like this:
+
+      my_string = \
+        '''this should not be considered a comment'''
+
+        my_string = \
+          '''this should not either'''  # <- notice the increasing indentation
+
+          my_string = \
+      '''weird syntax anyway'''  # <- but still valid indentation
+
+    the previous token to the string is the '=' operator and not a statement
+    end or an indentation change. That way, only triple-quoted strings that
+    start a statement are considered intended as comments.
 
   Args:
     code: String containing code to extract comments from.
@@ -20,11 +42,31 @@ def extract_comments(code: str) -> List[common.Comment]:
   Raises:
     tokenize.TokenError
   """
+  triplequotes = ('"""', "'''")
+  # Token types that may directly precede a string that starts a statement.
+  # ENCODING is always the first token, so it marks the start of the script.
+  multicommprevnums = (tokenize.ENCODING, tokenize.NEWLINE, tokenize.INDENT,
+                       tokenize.DEDENT)
+  prevtoknum = None  # Type of the previous token, skipping NL and COMMENT.
   comments = []
   tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline)
-  for toknum, tokstring, tokloc, _, _ in tokens:
-    if toknum is tokenize.COMMENT:
+  for toknum, tokstring, tokloc_start, tokloc_end, _ in tokens:
+    # Single-line comment.
+    if toknum == tokenize.COMMENT:
       # Removes leading '#' character.
       tokstring = tokstring[1:]
-      comments.append(common.Comment(tokstring, tokloc[0], False))
-  return comments
+      comments.append(common.Comment(tokstring, tokloc_start[0], False))
+      continue
+    # NL marks blank lines, comment-only lines and line breaks inside brackets.
+    # It never ends a statement, so it must not become the previous token.
+    if toknum == tokenize.NL:
+      continue
+    # Multi-line comment.
+    if (toknum == tokenize.STRING and tokstring.startswith(triplequotes) and
+        tokstring.endswith(triplequotes) and
+        (prevtoknum is None or prevtoknum in multicommprevnums)):
+      # Removes the leading and trailing triple quotes (""" or ''').
+      line_no = list(range(tokloc_start[0], tokloc_end[0] + 1))
+      comments.append(common.Comment(tokstring[3:-3], line_no, True))
+    prevtoknum = toknum
+  return comments
diff --git a/comment_parser/parsers/tests/python_parser_test.py b/comment_parser/parsers/tests/python_parser_test.py
index f6f1a38..3ac0603 100644
--- a/comment_parser/parsers/tests/python_parser_test.py
+++ b/comment_parser/parsers/tests/python_parser_test.py
@@ -45,3 +45,42 @@ def testEscapedDoubleQuote(self):
     comments = python_parser.extract_comments(code)
     expected = [common.Comment(code[3:], 1, multiline=False)]
     self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteMultiline(self):
+    code = '"""this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '"""'
+    comments = python_parser.extract_comments(code)
+    lines = [1, 2, 3, 4, 5]
+    expected = [common.Comment(code.strip('"'), lines, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteSingleline(self):
+    code = '"""this is triple quote comment"""'
+    comments = python_parser.extract_comments(code)
+    expected = [common.Comment(code.strip('"'), [1], multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteMultiline(self):
+    code = '\'\'\'this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '\'\'\''
+    comments = python_parser.extract_comments(code)
+    lines = [1, 2, 3, 4, 5]
+    expected = [common.Comment(code.strip('\''), lines, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteSingleline(self):
+    code = '\'\'\'this is triple quote comment\'\'\''
+    comments = python_parser.extract_comments(code)
+    expected = [common.Comment(code.strip('\''), [1], multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteStringInCallIsNotComment(self):
+    code = 'print(\n"""not a comment"""\n)'
+    comments = python_parser.extract_comments(code)
+    self.assertEqual(comments, [])