jeanralphaviles · zdaiot · Mar 29, 2023 · Mar 29, 2023
diff --git a/comment_parser/parsers/c_parser.py b/comment_parser/parsers/c_parser.py
@@ -60,8 +60,12 @@ def extract_comments(code: str) -> List[common.Comment]:
       comment = common.Comment(comment_content, line_no + 1)
       comments.append(comment)
     elif kind == "multi":
+      end_character = match.end()
+      line_no_end = bisect_left(lines_indexes, end_character)
+      line_no = [x for x in range(line_no, line_no_end+1)]
+      line_no = [x+1 for x in line_no]
       comment_content = match.group("multi_content")
-      comment = common.Comment(comment_content, line_no + 1, multiline=True)
+      comment = common.Comment(comment_content, line_no, multiline=True)
       comments.append(comment)
     elif kind == "error":
       raise common.UnterminatedCommentError()

diff --git a/comment_parser/parsers/common.py b/comment_parser/parsers/common.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 """This module provides constructs common to all comment parsers."""
+from typing import Union, List
 
 
 class Error(Exception):
@@ -17,12 +18,12 @@ class UnterminatedCommentError(Error):
 class Comment():
   """Represents comments found in source files."""
 
-  def __init__(self, text: str, line_number: int, multiline: bool = False):
+  def __init__(self, text: str, line_number: Union[int, List], multiline: bool = False):
     """Initializes Comment.
 
     Args:
       text: String text of comment.
-      line_number: Line number (int) comment was found on.
+      line_number: Line number (int or List) comment was found on.
       multiline: Boolean whether this comment was a multiline comment.
     """
     self._text = text
@@ -37,11 +38,11 @@ def text(self) -> str:
     """
     return self._text
 
-  def line_number(self) -> int:
+  def line_number(self) -> Union[int, List]:
     """Returns the line number the comment was found on.
 
     Returns:
-      Int
+      Union[int, List]
     """
     return self._line_number
 

diff --git a/comment_parser/parsers/html_parser.py b/comment_parser/parsers/html_parser.py
@@ -51,8 +51,12 @@ def extract_comments(code: str) -> List[common.Comment]:
       comment = common.Comment(comment_content, line_no + 1)
       comments.append(comment)
     elif kind == "multi":
+      end_character = match.end()
+      line_no_end = bisect_left(lines_indexes, end_character)
+      line_no = [x for x in range(line_no, line_no_end+1)]
+      line_no = [x+1 for x in line_no]
       comment_content = match.group("multi_content")
-      comment = common.Comment(comment_content, line_no + 1, multiline=True)
+      comment = common.Comment(comment_content, line_no, multiline=True)
       comments.append(comment)
     elif kind == "error":
       raise common.UnterminatedCommentError()

diff --git a/comment_parser/parsers/python_parser.py b/comment_parser/parsers/python_parser.py
@@ -3,15 +3,32 @@
 
 import io
 import tokenize
-from typing import List
 from comment_parser.parsers import common
 
 
-def extract_comments(code: str) -> List[common.Comment]:
+def extract_comments(code):
   """Extracts a list of comments from the given Python script.
 
-  Comments are identified using the tokenize module. Does not include function,
-  class, or module docstrings. All comments are single line comments.
+  Comments are identified using the tokenize module.
+    - Single-line comments, which begin with the '#' character and end at a line-break.
+    - Multi-line comments or docstrings, which are just triple-quoted strings (they start
+      and end with ''' or with three double-quote characters), are told apart from regular
+      strings by the type of the previous token, which should be a line-break or an
+      indentation (NEWLINE, NL, INDENT or DEDENT) or no token at all (meaning the string
+      is the first thing in the script). Even in cases like this:
+
+        my_string = \
+        '''this should not be considered a comment'''
+
+        my_string = \
+          '''this should not either''' # <- notice the increasing indentation
+
+        my_string = \
+            '''weird syntax anyway''' # <- but still valid indentation
+
+      the previous token to the string is the '=' operator and not a line-break or an
+      indentation. That way, only triple-quoted strings preceded by a line-break, an
+      indentation, or no token, will be considered intended as comments.
 
   Args:
     code: String containing code to extract comments from.
@@ -20,11 +37,28 @@ def extract_comments(code: str) -> List[common.Comment]:
   Raises:
     tokenize.TokenError
   """
+  triplequotes = ['"""', "'''"]
+  multicommprevnums = [
+      tokenize.ENCODING, tokenize.NEWLINE, tokenize.NL, tokenize.INDENT,
+      tokenize.DEDENT
+  ]
+  prevtoknum = None  # Stores the previous token's type.
   comments = []
   tokens = tokenize.tokenize(io.BytesIO(code.encode()).readline)
-  for toknum, tokstring, tokloc, _, _ in tokens:
+  for toknum, tokstring, tokloc_start, tokloc_end, _ in tokens:
+    # Single-lined comment.
     if toknum is tokenize.COMMENT:
       # Removes leading '#' character.
       tokstring = tokstring[1:]
-      comments.append(common.Comment(tokstring, tokloc[0], False))
-  return comments
+      comments.append(common.Comment(tokstring, tokloc_start[0], False))
+      continue
+    # Multi-lined comment.
+    if toknum is tokenize.STRING:
+      if tokstring[:3] in triplequotes and tokstring[-3:] in triplequotes:
+        if (not prevtoknum) or prevtoknum in multicommprevnums:
+          # Removes the leading and trailing triple quotes (""" or ''').
+          tokstring = tokstring[3:-3]
+          line_no = [x for x in range(tokloc_start[0], tokloc_end[0]+1)]
+          comments.append(common.Comment(tokstring, line_no, True))
+    prevtoknum = toknum
+  return comments
diff --git a/comment_parser/parsers/tests/python_parser_test.py b/comment_parser/parsers/tests/python_parser_test.py
@@ -45,3 +45,47 @@ def testEscapedDoubleQuote(self):
     comments = python_parser.extract_comments(code)
     expected = [common.Comment(code[3:], 1, multiline=False)]
     self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteMultiline(self):
+    code = '"""this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '"""'
+    comments = python_parser.extract_comments(code)
+    # A triple-quoted comment is reported with the list of every line it
+    # spans, so the five-line string above covers lines 1 through 5.
+    lines = [1, 2, 3, 4, 5]
+    expected = [common.Comment(code[3:-3], lines, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsDoubleQuoteSingleline(self):
+    code = '"""this is triple quote comment"""'
+    comments = python_parser.extract_comments(code)
+    # A triple-quoted comment is reported with the list of every line it
+    # spans; a string that starts and ends on line 1 yields a one-item list.
+    lines = [1]
+    expected = [common.Comment(code[3:-3], lines, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteMultiline(self):
+    code = '\'\'\'this is triple quote comment\n'
+    code += 'with\n'
+    code += 'multiple\n'
+    code += 'lines\n'
+    code += '\'\'\''
+    comments = python_parser.extract_comments(code)
+    # A triple-quoted comment is reported with the list of every line it
+    # spans, so the five-line string above covers lines 1 through 5.
+    lines = [1, 2, 3, 4, 5]
+    expected = [common.Comment(code[3:-3], lines, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testTripleQuoteCommentsSingleQuoteSingleline(self):
+    code = '\'\'\'this is triple quote comment\'\'\''
+    comments = python_parser.extract_comments(code)
+    # A triple-quoted comment is reported with the list of every line it
+    # spans; a string that starts and ends on line 1 yields a one-item list.
+    lines = [1]
+    expected = [common.Comment(code[3:-3], lines, multiline=True)]
+    self.assertEqual(comments, expected)