Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion comment_parser/parsers/c_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ def extract_comments(code: str) -> List[common.Comment]:
comment = common.Comment(comment_content, line_no + 1)
comments.append(comment)
elif kind == "multi":
end_character = match.end()
line_no_end = bisect_left(lines_indexes, end_character)
line_no = [x for x in range(line_no, line_no_end+1)]
line_no = [x+1 for x in line_no]
comment_content = match.group("multi_content")
comment = common.Comment(comment_content, line_no + 1, multiline=True)
comment = common.Comment(comment_content, line_no, multiline=True)
comments.append(comment)
elif kind == "error":
raise common.UnterminatedCommentError()
Expand Down
9 changes: 5 additions & 4 deletions comment_parser/parsers/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/python
"""This module provides constructs common to all comment parsers."""
from typing import Union, List


class Error(Exception):
Expand All @@ -17,12 +18,12 @@ class UnterminatedCommentError(Error):
class Comment():
"""Represents comments found in source files."""

def __init__(self, text: str, line_number: int, multiline: bool = False):
def __init__(self, text: str, line_number: Union[int, List], multiline: bool = False):
"""Initializes Comment.

Args:
text: String text of comment.
line_number: Line number (int) comment was found on.
line_number: Line number (int or List) comment was found on.
multiline: Boolean whether this comment was a multiline comment.
"""
self._text = text
Expand All @@ -37,11 +38,11 @@ def text(self) -> str:
"""
return self._text

def line_number(self) -> int:
def line_number(self) -> Union[int, List]:
"""Returns the line number the comment was found on.

Returns:
Int
Union[int, List]
"""
return self._line_number

Expand Down
6 changes: 5 additions & 1 deletion comment_parser/parsers/html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,12 @@ def extract_comments(code: str) -> List[common.Comment]:
comment = common.Comment(comment_content, line_no + 1)
comments.append(comment)
elif kind == "multi":
end_character = match.end()
line_no_end = bisect_left(lines_indexes, end_character)
line_no = [x for x in range(line_no, line_no_end+1)]
line_no = [x+1 for x in line_no]
comment_content = match.group("multi_content")
comment = common.Comment(comment_content, line_no + 1, multiline=True)
comment = common.Comment(comment_content, line_no, multiline=True)
comments.append(comment)
elif kind == "error":
raise common.UnterminatedCommentError()
Expand Down
48 changes: 41 additions & 7 deletions comment_parser/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,32 @@

import io
import tokenize
from typing import List
from comment_parser.parsers import common


def extract_comments(code: str) -> List[common.Comment]:
    r"""Extracts a list of comments from the given Python script.

    Comments are identified using the tokenize module:
      - Single-line comments begin with the '#' character and end at the
        line break.
      - Multi-line comments (docstrings) are triple-quoted strings, i.e.
        strings that start and end with ''' or with three double quotes.
        They are told apart from regular strings by the type of the
        previous token, which must be a line break or an indentation
        change (NEWLINE, NL, INDENT or DEDENT) or the leading ENCODING
        token (the string is the first thing in the script).

    A triple-quoted string that continues an assignment is therefore not
    reported, because the token preceding it is the '=' operator:

        my_string = \
            '''this should not be considered a comment'''

    Strings placed on their own line inside parentheses, brackets or braces
    are preceded by an NL token as well; they are part of an expression, so
    they are skipped by tracking the bracket nesting depth.

    Args:
      code: String containing code to extract comments from.
    Returns:
      Python list of common.Comment in the order that they appear in the
      code. Single-line comments carry the int line number they were found
      on; multi-line comments carry the list of every line number they span.
    Raises:
      tokenize.TokenError
    """
    triple_quotes = ('"""', "'''")
    # Token types after which a string token begins a statement of its own.
    statement_start_types = (
        tokenize.ENCODING,
        tokenize.NEWLINE,
        tokenize.NL,
        tokenize.INDENT,
        tokenize.DEDENT,
    )
    opening_brackets = ("(", "[", "{")
    closing_brackets = (")", "]", "}")

    comments = []
    prev_type = None  # Type of the previously seen token.
    bracket_depth = 0  # > 0 while inside (), [] or {}.
    tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline)
    for tok_type, tok_string, tok_start, tok_end, _ in tokens:
        if tok_type == tokenize.COMMENT:
            # Single-line comment: drop the leading '#' character.
            comments.append(
                common.Comment(tok_string[1:], tok_start[0], False))
        elif tok_type == tokenize.OP:
            if tok_string in opening_brackets:
                bracket_depth += 1
            elif tok_string in closing_brackets:
                # Never go negative on a stray closing bracket.
                bracket_depth = max(0, bracket_depth - 1)
        elif (tok_type == tokenize.STRING
              and bracket_depth == 0
              and (prev_type is None or prev_type in statement_start_types)
              and tok_string[:3] in triple_quotes
              and tok_string[-3:] in triple_quotes):
            # Multi-line comment: strip the opening and closing triple
            # quotes and record every line the string spans.
            line_numbers = list(range(tok_start[0], tok_end[0] + 1))
            comments.append(
                common.Comment(tok_string[3:-3], line_numbers, True))
        # Updated for every token so the next iteration always sees the
        # token that immediately precedes it.
        prev_type = tok_type
    return comments
44 changes: 44 additions & 0 deletions comment_parser/parsers/tests/python_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,47 @@ def testEscapedDoubleQuote(self):
comments = python_parser.extract_comments(code)
expected = [common.Comment(code[3:], 1, multiline=False)]
self.assertEqual(comments, expected)

def testTripleQuoteCommentsDoubleQuoteMultiline(self):
    # A triple-double-quoted string spanning five lines at the start of the
    # script must come back as one multiline comment.
    code = '"""this is triple quote comment\n'
    code += 'with\n'
    code += 'multiple\n'
    code += 'lines\n'
    code += '"""'
    comments = python_parser.extract_comments(code)
    # The parser strips exactly three quote characters from each end and
    # reports the list of every line the string spans.
    expected = [common.Comment(code[3:-3], [1, 2, 3, 4, 5], multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsDoubleQuoteSingleline(self):
    # A triple-double-quoted string on a single line is still reported as a
    # multiline comment.
    code = '"""this is triple quote comment"""'
    comments = python_parser.extract_comments(code)
    # The parser strips exactly three quote characters from each end and
    # reports the spanned lines as a list, here only line 1.
    expected = [common.Comment(code[3:-3], [1], multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsSingleQuoteMultiline(self):
    # A triple-single-quoted string spanning five lines at the start of the
    # script must come back as one multiline comment.
    code = '\'\'\'this is triple quote comment\n'
    code += 'with\n'
    code += 'multiple\n'
    code += 'lines\n'
    code += '\'\'\''
    comments = python_parser.extract_comments(code)
    # The parser strips exactly three quote characters from each end and
    # reports the list of every line the string spans.
    expected = [common.Comment(code[3:-3], [1, 2, 3, 4, 5], multiline=True)]
    self.assertEqual(comments, expected)

def testTripleQuoteCommentsSingleQuoteSingleline(self):
    # A triple-single-quoted string on a single line is still reported as a
    # multiline comment.
    code = '\'\'\'this is triple quote comment\'\'\''
    comments = python_parser.extract_comments(code)
    # The parser strips exactly three quote characters from each end and
    # reports the spanned lines as a list, here only line 1.
    expected = [common.Comment(code[3:-3], [1], multiline=True)]
    self.assertEqual(comments, expected)