#!/usr/bin/python3

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date: 2022-03-04 16:57:13 +0100 (Fr, 04. Mär 2022) $
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)
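# Added sketch (not in the original module): the entry must match the token
# naming scheme in use -- 'long' names spell the token type out, while the
# 'short' scheme uses Pygments' CSS class abbreviations:
#
#     unstyled_tokens += ['punctuation',  # tokennames='long'
#                         'p']            # tokennames='short'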


class LexerError(ApplicationError):
    pass


class Lexer:
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """
        Set up a lexical analyzer for `code` in `language`.
        """
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2. (released Jan 01, 2010) Pygments has a
        # TokenMergeFilter. ``self.merge(tokens)`` in __iter__ could
        # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
        # However, `merge` below also strips a final newline added by pygments.
        #
        # self.lexer.add_filter('tokenmerge')

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield lasttype, lastval
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield lasttype, lastval
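
    # Illustrative note (added, not part of the original module): Pygments
    # may emit several adjacent tokens of the same type, e.g.
    #   (Token.Text, ' '), (Token.Text, ' ')
    # which merge() combines into the single tuple (Token.Text, '  ').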

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            yield [], self.code
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield classes, value
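
# Usage sketch (added; illustrative only -- the exact classes depend on the
# installed Pygments version):
#
#     for classes, value in Lexer('print("hi")\n', 'python', 'short'):
#         print(classes, repr(value))
#
# might print, e.g.:
#
#     ['nb'] 'print'
#     ['p'] '('
#     ['s2'] '"hi"'
#     ['p'] ')'
#
# With tokennames='long', the first token's classes would be
# ['name', 'builtin'] instead of ['nb'].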


class NumberLines:
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield ['ln'], self.fmt_str % lineno
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield ttype, line + '\n'
                lineno += 1
                yield ['ln'], self.fmt_str % lineno
            yield ttype, lines[-1]
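

if __name__ == '__main__':
    # Minimal demonstration (added sketch, not part of the original module;
    # requires Pygments). It mirrors how Docutils' "code" directive chains
    # Lexer and NumberLines: tokenize a sample, then prefix line numbers.
    sample = 'x = 1\nprint(x)\n'
    tokens = Lexer(sample, 'python', tokennames='short')
    for classes, value in NumberLines(tokens, 1, sample.count('\n')):
        print(classes, repr(value))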