from __future__ import absolute_import, unicode_literals import re from commonmark import common from commonmark.common import unescape_string from commonmark.inlines import InlineParser from commonmark.node import Node CODE_INDENT = 4 reHtmlBlockOpen = [ re.compile(r'.'), # dummy for 0 re.compile(r'^<(?:script|pre|style)(?:\s|>|$)', re.IGNORECASE), re.compile(r'^<!--'), re.compile(r'^<[?]'), re.compile(r'^<![A-Z]'), re.compile(r'^<!\[CDATA\['), re.compile( r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|' r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|' r'fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|' r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|' r'nav|noframes|ol|optgroup|option|p|param|section|source|title|' r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)' r'(?:\s|[/]?[>]|$)', re.IGNORECASE), re.compile( '^(?:' + common.OPENTAG + '|' + common.CLOSETAG + ')\\s*$', re.IGNORECASE), ] reHtmlBlockClose = [ re.compile(r'.'), # dummy for 0 re.compile(r'<\/(?:script|pre|style)>', re.IGNORECASE), re.compile(r'-->'), re.compile(r'\?>'), re.compile(r'>'), re.compile(r'\]\]>'), ] reThematicBreak = re.compile( r'^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$') reMaybeSpecial = re.compile(r'^[#`~*+_=<>0-9-]') reNonSpace = re.compile(r'[^ \t\f\v\r\n]') reBulletListMarker = re.compile(r'^[*+-]') reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])') reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)') reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}') reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)') reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$') reLineEnding = re.compile(r'\r\n|\n|\r') def is_blank(s): """Returns True if string contains only space characters.""" return re.search(reNonSpace, s) is None def is_space_or_tab(s): return s in (' ', '\t') def peek(ln, pos): if pos < len(ln): return ln[pos] else: return None def ends_with_blank_line(block): """ Returns true if block ends with a blank line, descending if needed into lists and sublists.""" while block: if block.last_line_blank: return True if not block.last_line_checked and \ block.t in ('list', 'item'): block.last_line_checked = True block = block.last_child else: block.last_line_checked = True break return False def parse_list_marker(parser, container): """ Parse a list marker and return data on the marker (type, start, delimiter, bullet character, padding) or None.""" rest = parser.current_line[parser.next_nonspace:] data = { 'type': None, 'tight': True, # lists are tight by default 'bullet_char': None, 'start': None, 'delimiter': None, 'padding': None, 'marker_offset': parser.indent, } if parser.indent >= 4: return None m = re.search(reBulletListMarker, rest) m2 = re.search(reOrderedListMarker, rest) if m: data['type'] = 'bullet' data['bullet_char'] = m.group()[0] elif m2 and (container.t != 'paragraph' or m2.group(1) == '1'): m = m2 data['type'] = 'ordered' data['start'] = int(m.group(1)) data['delimiter'] = m.group(2) else: return None # make sure we have spaces after nextc = peek(parser.current_line, parser.next_nonspace + len(m.group())) if not (nextc is None or nextc == '\t' or nextc == ' '): return None # if it interrupts paragraph, make sure first line isn't blank if container.t == 'paragraph' and \ not re.search( reNonSpace, parser.current_line[parser.next_nonspace + len(m.group()):]): return None # we've got a match! advance offset and calculate padding parser.advance_next_nonspace() # to start of marker parser.advance_offset(len(m.group()), True) # to end of marker spaces_start_col = parser.column spaces_start_offset = parser.offset while True: parser.advance_offset(1, True) nextc = peek(parser.current_line, parser.offset) if parser.column - spaces_start_col < 5 and \ is_space_or_tab(nextc): pass else: break blank_item = peek(parser.current_line, parser.offset) is None spaces_after_marker = parser.column - spaces_start_col if spaces_after_marker >= 5 or \ spaces_after_marker < 1 or \ blank_item: data['padding'] = len(m.group()) + 1 parser.column = spaces_start_col parser.offset = spaces_start_offset if is_space_or_tab(peek(parser.current_line, parser.offset)): parser.advance_offset(1, True) else: data['padding'] = len(m.group()) + spaces_after_marker return data def lists_match(list_data, item_data): """ Returns True if the two list items are of the same type, with the same delimiter and bullet character. This is used in agglomerating list items into lists. """ return list_data.get('type') == item_data.get('type') and \ list_data.get('delimiter') == item_data.get('delimiter') and \ list_data.get('bullet_char') == item_data.get('bullet_char') class Block(object): accepts_lines = None @staticmethod def continue_(parser=None, container=None): return @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return class Document(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): return 0 @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return t != 'item' class List(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): return 0 @staticmethod def finalize(parser=None, block=None): item = block.first_child while item: # check for non-final list item ending with blank line: if ends_with_blank_line(item) and item.nxt: block.list_data['tight'] = False break # recurse into children of list item, to see if there are # spaces between any of them: subitem = item.first_child while subitem: if ends_with_blank_line(subitem) and \ (item.nxt or subitem.nxt): block.list_data['tight'] = False break subitem = subitem.nxt item = item.nxt @staticmethod def can_contain(t): return t == 'item' class BlockQuote(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): ln = parser.current_line if not parser.indented and peek(ln, parser.next_nonspace) == '>': parser.advance_next_nonspace() parser.advance_offset(1, False) if is_space_or_tab(peek(ln, parser.offset)): parser.advance_offset(1, True) else: return 1 return 0 @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return t != 'item' class Item(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): if parser.blank: if container.first_child is None: # Blank line after empty list item return 1 else: parser.advance_next_nonspace() elif parser.indent >= (container.list_data['marker_offset'] + container.list_data['padding']): parser.advance_offset( container.list_data['marker_offset'] + container.list_data['padding'], True) else: return 1 return 0 @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return t != 'item' class Heading(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): # A heading can never container > 1 line, so fail to match: return 1 @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return False class ThematicBreak(Block): accepts_lines = False @staticmethod def continue_(parser=None, container=None): # A thematic break can never container > 1 line, so fail to match: return 1 @staticmethod def finalize(parser=None, block=None): return @staticmethod def can_contain(t): return False class CodeBlock(Block): accepts_lines = True @staticmethod def continue_(parser=None, container=None): ln = parser.current_line indent = parser.indent if container.is_fenced: match = indent <= 3 and \ len(ln) >= parser.next_nonspace + 1 and \ ln[parser.next_nonspace] == container.fence_char and \ re.search(reClosingCodeFence, ln[parser.next_nonspace:]) if match and len(match.group()) >= container.fence_length: # closing fence - we're at end of line, so we can return parser.finalize(container, parser.line_number) return 2 else: # skip optional spaces of fence offset i = container.fence_offset while i > 0 and is_space_or_tab(peek(ln, parser.offset)): parser.advance_offset(1, True) i -= 1 else: # indented if indent >= CODE_INDENT: parser.advance_offset(CODE_INDENT, True) elif parser.blank: parser.advance_next_nonspace() else: return 1 return 0 @staticmethod def finalize(parser=None, block=None): if block.is_fenced: # first line becomes info string content = block.string_content newline_pos = content.index('\n') first_line = content[0:newline_pos] rest = content[newline_pos + 1:] block.info = unescape_string(first_line.strip()) block.literal = rest else: # indented block.literal = re.sub(r'(\n *)+$', '\n', block.string_content) block.string_content = None @staticmethod def can_contain(t): return False class HtmlBlock(Block): accepts_lines = True @staticmethod def continue_(parser=None, container=None): if parser.blank and (container.html_block_type == 6 or container.html_block_type == 7): return 1 else: return 0 @staticmethod def finalize(parser=None, block=None): block.literal = re.sub(r'(\n *)+$', '', block.string_content) # allow GC block.string_content = None @staticmethod def can_contain(t): return False class Paragraph(Block): accepts_lines = True @staticmethod def continue_(parser=None, container=None): return 1 if parser.blank else 0 @staticmethod def finalize(parser=None, block=None): has_reference_defs = False # try parsing the beginning as link reference definitions: while peek(block.string_content, 0) == '[': pos = parser.inline_parser.parseReference( block.string_content, parser.refmap) if not pos: break block.string_content = block.string_content[pos:] has_reference_defs = True if has_reference_defs and is_blank(block.string_content): block.unlink() @staticmethod def can_contain(t): return False class BlockStarts(object): """Block start functions. Return values: 0 = no match 1 = matched container, keep going 2 = matched leaf, no more block starts """ METHODS = [ 'block_quote', 'atx_heading', 'fenced_code_block', 'html_block', 'setext_heading', 'thematic_break', 'list_item', 'indented_code_block', ] @staticmethod def block_quote(parser, container=None): if not parser.indented and \ peek(parser.current_line, parser.next_nonspace) == '>': parser.advance_next_nonspace() parser.advance_offset(1, False) # optional following space if is_space_or_tab(peek(parser.current_line, parser.offset)): parser.advance_offset(1, True) parser.close_unmatched_blocks() parser.add_child('block_quote', parser.next_nonspace) return 1 return 0 @staticmethod def atx_heading(parser, container=None): if not parser.indented: m = re.search(reATXHeadingMarker, parser.current_line[parser.next_nonspace:]) if m: parser.advance_next_nonspace() parser.advance_offset(len(m.group()), False) parser.close_unmatched_blocks() container = parser.add_child('heading', parser.next_nonspace) # number of #s container.level = len(m.group().strip()) # remove trailing ###s: container.string_content = re.sub( r'[ \t]+#+[ \t]*$', '', re.sub( r'^[ \t]*#+[ \t]*$', '', parser.current_line[parser.offset:])) parser.advance_offset( len(parser.current_line) - parser.offset, False) return 2 return 0 @staticmethod def fenced_code_block(parser, container=None): if not parser.indented: m = re.search( reCodeFence, parser.current_line[parser.next_nonspace:]) if m: fence_length = len(m.group()) parser.close_unmatched_blocks() container = parser.add_child( 'code_block', parser.next_nonspace) container.is_fenced = True container.fence_length = fence_length container.fence_char = m.group()[0] container.fence_offset = parser.indent parser.advance_next_nonspace() parser.advance_offset(fence_length, False) return 2 return 0 @staticmethod def html_block(parser, container=None): if not parser.indented and \ peek(parser.current_line, parser.next_nonspace) == '<': s = parser.current_line[parser.next_nonspace:] for block_type in range(1, 8): if re.search(reHtmlBlockOpen[block_type], s) and \ (block_type < 7 or container.t != 'paragraph'): parser.close_unmatched_blocks() # We don't adjust parser.offset; # spaces are part of the HTML block: b = parser.add_child('html_block', parser.offset) b.html_block_type = block_type return 2 return 0 @staticmethod def setext_heading(parser, container=None): if not parser.indented and container.t == 'paragraph': m = re.search( reSetextHeadingLine, parser.current_line[parser.next_nonspace:]) if m: parser.close_unmatched_blocks() # resolve reference link definitiosn while peek(container.string_content, 0) == '[': pos = parser.inline_parser.parseReference( container.string_content, parser.refmap) if not pos: break container.string_content = container.string_content[pos:] if container.string_content: heading = Node('heading', container.sourcepos) heading.level = 1 if m.group()[0] == '=' else 2 heading.string_content = container.string_content container.insert_after(heading) container.unlink() parser.tip = heading parser.advance_offset( len(parser.current_line) - parser.offset, False) return 2 else: return 0 return 0 @staticmethod def thematic_break(parser, container=None): if not parser.indented and re.search( reThematicBreak, parser.current_line[parser.next_nonspace:]): parser.close_unmatched_blocks() parser.add_child('thematic_break', parser.next_nonspace) parser.advance_offset( len(parser.current_line) - parser.offset, False) return 2 return 0 @staticmethod def list_item(parser, container=None): if (not parser.indented or container.t == 'list'): data = parse_list_marker(parser, container) if data: parser.close_unmatched_blocks() # add the list if needed if parser.tip.t != 'list' or \ not lists_match(container.list_data, data): container = parser.add_child('list', parser.next_nonspace) container.list_data = data # add the list item container = parser.add_child('item', parser.next_nonspace) container.list_data = data return 1 return 0 @staticmethod def indented_code_block(parser, container=None): if parser.indented and \ parser.tip.t != 'paragraph' and \ not parser.blank: # indented code parser.advance_offset(CODE_INDENT, True) parser.close_unmatched_blocks() parser.add_child('code_block', parser.offset) return 2 return 0 class Parser(object): def __init__(self, options={}): self.doc = Node('document', [[1, 1], [0, 0]]) self.block_starts = BlockStarts() self.tip = self.doc self.oldtip = self.doc self.current_line = '' self.line_number = 0 self.offset = 0 self.column = 0 self.next_nonspace = 0 self.next_nonspace_column = 0 self.indent = 0 self.indented = False self.blank = False self.partially_consumed_tab = False self.all_closed = True self.last_matched_container = self.doc self.refmap = {} self.last_line_length = 0 self.inline_parser = InlineParser(options) self.options = options def add_line(self): """ Add a line to the block at the tip. We assume the tip can accept lines -- that check should be done before calling this.""" if self.partially_consumed_tab: # Skip over tab self.offset += 1 # Add space characters chars_to_tab = 4 - (self.column % 4) self.tip.string_content += (' ' * chars_to_tab) self.tip.string_content += (self.current_line[self.offset:] + '\n') def add_child(self, tag, offset): """ Add block of type tag as a child of the tip. If the tip can't accept children, close and finalize it and try its parent, and so on til we find a block that can accept children.""" while not self.blocks[self.tip.t].can_contain(tag): self.finalize(self.tip, self.line_number - 1) column_number = offset + 1 new_block = Node(tag, [[self.line_number, column_number], [0, 0]]) new_block.string_content = '' self.tip.append_child(new_block) self.tip = new_block return new_block def close_unmatched_blocks(self): """Finalize and close any unmatched blocks.""" if not self.all_closed: while self.oldtip != self.last_matched_container: parent = self.oldtip.parent self.finalize(self.oldtip, self.line_number - 1) self.oldtip = parent self.all_closed = True def find_next_nonspace(self): current_line = self.current_line i = self.offset cols = self.column try: c = current_line[i] except IndexError: c = '' while c != '': if c == ' ': i += 1 cols += 1 elif c == '\t': i += 1 cols += (4 - (cols % 4)) else: break try: c = current_line[i] except IndexError: c = '' self.blank = (c == '\n' or c == '\r' or c == '') self.next_nonspace = i self.next_nonspace_column = cols self.indent = self.next_nonspace_column - self.column self.indented = self.indent >= CODE_INDENT def advance_next_nonspace(self): self.offset = self.next_nonspace self.column = self.next_nonspace_column self.partially_consumed_tab = False def advance_offset(self, count, columns): current_line = self.current_line try: c = current_line[self.offset] except IndexError: c = None while count > 0 and c is not None: if c == '\t': chars_to_tab = 4 - (self.column % 4) if columns: self.partially_consumed_tab = chars_to_tab > count chars_to_advance = min(count, chars_to_tab) self.column += chars_to_advance self.offset += 0 if self.partially_consumed_tab else 1 count -= chars_to_advance else: self.partially_consumed_tab = False self.column += chars_to_tab self.offset += 1 count -= 1 else: self.partially_consumed_tab = False self.offset += 1 # assume ascii; block starts are ascii self.column += 1 count -= 1 try: c = current_line[self.offset] except IndexError: c = None def incorporate_line(self, ln): """Analyze a line of text and update the document appropriately. We parse markdown text by calling this on each line of input, then finalizing the document. """ all_matched = True container = self.doc self.oldtip = self.tip self.offset = 0 self.column = 0 self.blank = False self.partially_consumed_tab = False self.line_number += 1 # replace NUL characters for security if re.search(r'\u0000', ln) is not None: ln = re.sub(r'\0', '\uFFFD', ln) self.current_line = ln # For each containing block, try to parse the associated line start. # Bail out on failure: container will point to the last matching block. # Set all_matched to false if not all containers match. while True: last_child = container.last_child if not (last_child and last_child.is_open): break container = last_child self.find_next_nonspace() rv = self.blocks[container.t].continue_(self, container) if rv == 0: # we've matched, keep going pass elif rv == 1: # we've failed to match a block all_matched = False elif rv == 2: # we've hit end of line for fenced code close and can return self.last_line_length = len(ln) return else: raise ValueError( 'continue_ returned illegal value, must be 0, 1, or 2') if not all_matched: # back up to last matching block container = container.parent break self.all_closed = (container == self.oldtip) self.last_matched_container = container matched_leaf = container.t != 'paragraph' and \ self.blocks[container.t].accepts_lines starts = self.block_starts starts_len = len(starts.METHODS) # Unless last matched container is a code block, try new container # starts, adding children to the last matched container: while not matched_leaf: self.find_next_nonspace() # this is a little performance optimization: if not self.indented and \ not re.search(reMaybeSpecial, ln[self.next_nonspace:]): self.advance_next_nonspace() break i = 0 while i < starts_len: res = getattr(starts, starts.METHODS[i])(self, container) if res == 1: container = self.tip break elif res == 2: container = self.tip matched_leaf = True break else: i += 1 if i == starts_len: # nothing matched self.advance_next_nonspace() break # What remains at the offset is a text line. Add the text to the # appropriate container. if not self.all_closed and not self.blank and \ self.tip.t == 'paragraph': # lazy paragraph continuation self.add_line() else: # not a lazy continuation # finalize any blocks not matched self.close_unmatched_blocks() if self.blank and container.last_child: container.last_child.last_line_blank = True t = container.t # Block quote lines are never blank as they start with > # and we don't count blanks in fenced code for purposes of # tight/loose lists or breaking out of lists. We also # don't set last_line_blank on an empty list item, or if we # just closed a fenced block. last_line_blank = self.blank and \ not (t == 'block_quote' or (t == 'code_block' and container.is_fenced) or (t == 'item' and not container.first_child and container.sourcepos[0][0] == self.line_number)) # propagate last_line_blank up through parents: cont = container while cont: cont.last_line_blank = last_line_blank cont = cont.parent if self.blocks[t].accepts_lines: self.add_line() # if HtmlBlock, check for end condition if t == 'html_block' and \ container.html_block_type >= 1 and \ container.html_block_type <= 5 and \ re.search( reHtmlBlockClose[container.html_block_type], self.current_line[self.offset:]): self.finalize(container, self.line_number) elif self.offset < len(ln) and not self.blank: # create a paragraph container for one line container = self.add_child('paragraph', self.offset) self.advance_next_nonspace() self.add_line() self.last_line_length = len(ln) def finalize(self, block, line_number): """ Finalize a block. Close it and do any necessary postprocessing, e.g. creating string_content from strings, setting the 'tight' or 'loose' status of a list, and parsing the beginnings of paragraphs for reference definitions. Reset the tip to the parent of the closed block.""" above = block.parent block.is_open = False block.sourcepos[1] = [line_number, self.last_line_length] self.blocks[block.t].finalize(self, block) self.tip = above def process_inlines(self, block): """ Walk through a block & children recursively, parsing string content into inline content where appropriate. """ walker = block.walker() self.inline_parser.refmap = self.refmap self.inline_parser.options = self.options event = walker.nxt() while event is not None: node = event['node'] t = node.t if not event['entering'] and (t == 'paragraph' or t == 'heading'): self.inline_parser.parse(node) event = walker.nxt() def parse(self, my_input): """ The main parsing function. Returns a parsed document AST.""" self.doc = Node('document', [[1, 1], [0, 0]]) self.tip = self.doc self.refmap = {} self.line_number = 0 self.last_line_length = 0 self.offset = 0 self.column = 0 self.last_matched_container = self.doc self.current_line = '' lines = re.split(reLineEnding, my_input) length = len(lines) if len(my_input) > 0 and my_input[-1] == '\n': # ignore last blank line created by final newline length -= 1 for i in range(length): self.incorporate_line(lines[i]) while (self.tip): self.finalize(self.tip, length) self.process_inlines(self.doc) return self.doc CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))") Parser.blocks = dict( (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls) for cls in Block.__subclasses__())