usse/funda-scraper/venv/lib/python3.10/site-packages/pygments/lexers/html.py

605 lines
19 KiB
Python
Raw Normal View History

2023-02-20 22:38:24 +00:00
"""
pygments.lexers.html
~~~~~~~~~~~~~~~~~~~~
Lexers for HTML, XML and related markup.
:copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
from pygments.lexer import RegexLexer, ExtendedRegexLexer, include, bygroups, \
default, using
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
Punctuation
from pygments.util import looks_like_xml, html_doctype_matches
from pygments.lexers.javascript import JavascriptLexer
from pygments.lexers.jvm import ScalaLexer
from pygments.lexers.css import CssLexer, _indentation, _starts_block
from pygments.lexers.ruby import RubyLexer
__all__ = ['HtmlLexer', 'DtdLexer', 'XmlLexer', 'XsltLexer', 'HamlLexer',
'ScamlLexer', 'PugLexer']
class HtmlLexer(RegexLexer):
"""
For HTML 4 and XHTML 1 markup. Nested JavaScript and CSS is highlighted
by the appropriate lexer.
"""
name = 'HTML'
url = 'https://html.spec.whatwg.org/'
aliases = ['html']
filenames = ['*.html', '*.htm', '*.xhtml', '*.xslt']
mimetypes = ['text/html', 'application/xhtml+xml']
flags = re.IGNORECASE | re.DOTALL
tokens = {
'root': [
('[^<&]+', Text),
(r'&\S*?;', Name.Entity),
(r'\<\!\[CDATA\[.*?\]\]\>', Comment.Preproc),
(r'<!--.*?-->', Comment.Multiline),
(r'<\?.*?\?>', Comment.Preproc),
('<![^>]*>', Comment.Preproc),
(r'(<)(\s*)(script)(\s*)',
bygroups(Punctuation, Text, Name.Tag, Text),
('script-content', 'tag')),
(r'(<)(\s*)(style)(\s*)',
bygroups(Punctuation, Text, Name.Tag, Text),
('style-content', 'tag')),
# note: this allows tag names not used in HTML like <x:with-dash>,
# this is to support yet-unknown template engines and the like
(r'(<)(\s*)([\w:.-]+)',
bygroups(Punctuation, Text, Name.Tag), 'tag'),
(r'(<)(\s*)(/)(\s*)([\w:.-]+)(\s*)(>)',
bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
Punctuation)),
],
'tag': [
(r'\s+', Text),
(r'([\w:-]+\s*)(=)(\s*)', bygroups(Name.Attribute, Operator, Text),
'attr'),
(r'[\w:-]+', Name.Attribute),
(r'(/?)(\s*)(>)', bygroups(Punctuation, Text, Punctuation), '#pop'),
],
'script-content': [
(r'(<)(\s*)(/)(\s*)(script)(\s*)(>)',
bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
Punctuation), '#pop'),
(r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)),
# fallback cases for when there is no closing script tag
# first look for newline and then go back into root state
# if that fails just read the rest of the file
# this is similar to the error handling logic in lexer.py
(r'.+?\n', using(JavascriptLexer), '#pop'),
(r'.+', using(JavascriptLexer), '#pop'),
],
'style-content': [
(r'(<)(\s*)(/)(\s*)(style)(\s*)(>)',
bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
Punctuation),'#pop'),
(r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)),
# fallback cases for when there is no closing style tag
# first look for newline and then go back into root state
# if that fails just read the rest of the file
# this is similar to the error handling logic in lexer.py
(r'.+?\n', using(CssLexer), '#pop'),
(r'.+', using(CssLexer), '#pop'),
],
'attr': [
('".*?"', String, '#pop'),
("'.*?'", String, '#pop'),
(r'[^\s>]+', String, '#pop'),
],
}
def analyse_text(text):
if html_doctype_matches(text):
return 0.5
class DtdLexer(RegexLexer):
"""
A lexer for DTDs (Document Type Definitions).
.. versionadded:: 1.5
"""
flags = re.MULTILINE | re.DOTALL
name = 'DTD'
aliases = ['dtd']
filenames = ['*.dtd']
mimetypes = ['application/xml-dtd']
tokens = {
'root': [
include('common'),
(r'(<!ELEMENT)(\s+)(\S+)',
bygroups(Keyword, Text, Name.Tag), 'element'),
(r'(<!ATTLIST)(\s+)(\S+)',
bygroups(Keyword, Text, Name.Tag), 'attlist'),
(r'(<!ENTITY)(\s+)(\S+)',
bygroups(Keyword, Text, Name.Entity), 'entity'),
(r'(<!NOTATION)(\s+)(\S+)',
bygroups(Keyword, Text, Name.Tag), 'notation'),
(r'(<!\[)([^\[\s]+)(\s*)(\[)', # conditional sections
bygroups(Keyword, Name.Entity, Text, Keyword)),
(r'(<!DOCTYPE)(\s+)([^>\s]+)',
bygroups(Keyword, Text, Name.Tag)),
(r'PUBLIC|SYSTEM', Keyword.Constant),
(r'[\[\]>]', Keyword),
],
'common': [
(r'\s+', Text),
(r'(%|&)[^;]*;', Name.Entity),
('<!--', Comment, 'comment'),
(r'[(|)*,?+]', Operator),
(r'"[^"]*"', String.Double),
(r'\'[^\']*\'', String.Single),
],
'comment': [
('[^-]+', Comment),
('-->', Comment, '#pop'),
('-', Comment),
],
'element': [
include('common'),
(r'EMPTY|ANY|#PCDATA', Keyword.Constant),
(r'[^>\s|()?+*,]+', Name.Tag),
(r'>', Keyword, '#pop'),
],
'attlist': [
include('common'),
(r'CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION',
Keyword.Constant),
(r'#REQUIRED|#IMPLIED|#FIXED', Keyword.Constant),
(r'xml:space|xml:lang', Keyword.Reserved),
(r'[^>\s|()?+*,]+', Name.Attribute),
(r'>', Keyword, '#pop'),
],
'entity': [
include('common'),
(r'SYSTEM|PUBLIC|NDATA', Keyword.Constant),
(r'[^>\s|()?+*,]+', Name.Entity),
(r'>', Keyword, '#pop'),
],
'notation': [
include('common'),
(r'SYSTEM|PUBLIC', Keyword.Constant),
(r'[^>\s|()?+*,]+', Name.Attribute),
(r'>', Keyword, '#pop'),
],
}
def analyse_text(text):
if not looks_like_xml(text) and \
('<!ELEMENT' in text or '<!ATTLIST' in text or '<!ENTITY' in text):
return 0.8
class XmlLexer(RegexLexer):
"""
Generic lexer for XML (eXtensible Markup Language).
"""
flags = re.MULTILINE | re.DOTALL
name = 'XML'
aliases = ['xml']
filenames = ['*.xml', '*.xsl', '*.rss', '*.xslt', '*.xsd',
'*.wsdl', '*.wsf']
mimetypes = ['text/xml', 'application/xml', 'image/svg+xml',
'application/rss+xml', 'application/atom+xml']
tokens = {
'root': [
('[^<&]+', Text),
(r'&\S*?;', Name.Entity),
(r'\<\!\[CDATA\[.*?\]\]\>', Comment.Preproc),
(r'<!--.*?-->', Comment.Multiline),
(r'<\?.*?\?>', Comment.Preproc),
('<![^>]*>', Comment.Preproc),
(r'<\s*[\w:.-]+', Name.Tag, 'tag'),
(r'<\s*/\s*[\w:.-]+\s*>', Name.Tag),
],
'tag': [
(r'\s+', Text),
(r'[\w.:-]+\s*=', Name.Attribute, 'attr'),
(r'/?\s*>', Name.Tag, '#pop'),
],
'attr': [
(r'\s+', Text),
('".*?"', String, '#pop'),
("'.*?'", String, '#pop'),
(r'[^\s>]+', String, '#pop'),
],
}
def analyse_text(text):
if looks_like_xml(text):
return 0.45 # less than HTML
class XsltLexer(XmlLexer):
"""
A lexer for XSLT.
.. versionadded:: 0.10
"""
name = 'XSLT'
aliases = ['xslt']
filenames = ['*.xsl', '*.xslt', '*.xpl'] # xpl is XProc
mimetypes = ['application/xsl+xml', 'application/xslt+xml']
EXTRA_KEYWORDS = {
'apply-imports', 'apply-templates', 'attribute',
'attribute-set', 'call-template', 'choose', 'comment',
'copy', 'copy-of', 'decimal-format', 'element', 'fallback',
'for-each', 'if', 'import', 'include', 'key', 'message',
'namespace-alias', 'number', 'otherwise', 'output', 'param',
'preserve-space', 'processing-instruction', 'sort',
'strip-space', 'stylesheet', 'template', 'text', 'transform',
'value-of', 'variable', 'when', 'with-param'
}
def get_tokens_unprocessed(self, text):
for index, token, value in XmlLexer.get_tokens_unprocessed(self, text):
m = re.match('</?xsl:([^>]*)/?>?', value)
if token is Name.Tag and m and m.group(1) in self.EXTRA_KEYWORDS:
yield index, Keyword, value
else:
yield index, token, value
def analyse_text(text):
if looks_like_xml(text) and '<xsl' in text:
return 0.8
class HamlLexer(ExtendedRegexLexer):
"""
For Haml markup.
.. versionadded:: 1.3
"""
name = 'Haml'
aliases = ['haml']
filenames = ['*.haml']
mimetypes = ['text/x-haml']
flags = re.IGNORECASE
# Haml can include " |\n" anywhere,
# which is ignored and used to wrap long lines.
# To accommodate this, use this custom faux dot instead.
_dot = r'(?: \|\n(?=.* \|)|.)'
# In certain places, a comma at the end of the line
# allows line wrapping as well.
_comma_dot = r'(?:,\s*\n|' + _dot + ')'
tokens = {
'root': [
(r'[ \t]*\n', Text),
(r'[ \t]*', _indentation),
],
'css': [
(r'\.[\w:-]+', Name.Class, 'tag'),
(r'\#[\w:-]+', Name.Function, 'tag'),
],
'eval-or-plain': [
(r'[&!]?==', Punctuation, 'plain'),
(r'([&!]?[=~])(' + _comma_dot + r'*\n)',
bygroups(Punctuation, using(RubyLexer)),
'root'),
default('plain'),
],
'content': [
include('css'),
(r'%[\w:-]+', Name.Tag, 'tag'),
(r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
(r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
bygroups(Comment, Comment.Special, Comment),
'#pop'),
(r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
'#pop'),
(r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
'haml-comment-block'), '#pop'),
(r'(-)(' + _comma_dot + r'*\n)',
bygroups(Punctuation, using(RubyLexer)),
'#pop'),
(r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
'#pop'),
include('eval-or-plain'),
],
'tag': [
include('css'),
(r'\{(,\n|' + _dot + r')*?\}', using(RubyLexer)),
(r'\[' + _dot + r'*?\]', using(RubyLexer)),
(r'\(', Text, 'html-attributes'),
(r'/[ \t]*\n', Punctuation, '#pop:2'),
(r'[<>]{1,2}(?=[ \t=])', Punctuation),
include('eval-or-plain'),
],
'plain': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(RubyLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
'html-attributes': [
(r'\s+', Text),
(r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
(r'[\w:-]+', Name.Attribute),
(r'\)', Text, '#pop'),
],
'html-attribute-value': [
(r'[ \t]+', Text),
(r'\w+', Name.Variable, '#pop'),
(r'@\w+', Name.Variable.Instance, '#pop'),
(r'\$\w+', Name.Variable.Global, '#pop'),
(r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
(r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
],
'html-comment-block': [
(_dot + '+', Comment),
(r'\n', Text, 'root'),
],
'haml-comment-block': [
(_dot + '+', Comment.Preproc),
(r'\n', Text, 'root'),
],
'filter-block': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(RubyLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
}
class ScamlLexer(ExtendedRegexLexer):
"""
For `Scaml markup <http://scalate.fusesource.org/>`_. Scaml is Haml for Scala.
.. versionadded:: 1.4
"""
name = 'Scaml'
aliases = ['scaml']
filenames = ['*.scaml']
mimetypes = ['text/x-scaml']
flags = re.IGNORECASE
# Scaml does not yet support the " |\n" notation to
# wrap long lines. Once it does, use the custom faux
# dot instead.
# _dot = r'(?: \|\n(?=.* \|)|.)'
_dot = r'.'
tokens = {
'root': [
(r'[ \t]*\n', Text),
(r'[ \t]*', _indentation),
],
'css': [
(r'\.[\w:-]+', Name.Class, 'tag'),
(r'\#[\w:-]+', Name.Function, 'tag'),
],
'eval-or-plain': [
(r'[&!]?==', Punctuation, 'plain'),
(r'([&!]?[=~])(' + _dot + r'*\n)',
bygroups(Punctuation, using(ScalaLexer)),
'root'),
default('plain'),
],
'content': [
include('css'),
(r'%[\w:-]+', Name.Tag, 'tag'),
(r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
(r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
bygroups(Comment, Comment.Special, Comment),
'#pop'),
(r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
'#pop'),
(r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
'scaml-comment-block'), '#pop'),
(r'(-@\s*)(import)?(' + _dot + r'*\n)',
bygroups(Punctuation, Keyword, using(ScalaLexer)),
'#pop'),
(r'(-)(' + _dot + r'*\n)',
bygroups(Punctuation, using(ScalaLexer)),
'#pop'),
(r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
'#pop'),
include('eval-or-plain'),
],
'tag': [
include('css'),
(r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)),
(r'\[' + _dot + r'*?\]', using(ScalaLexer)),
(r'\(', Text, 'html-attributes'),
(r'/[ \t]*\n', Punctuation, '#pop:2'),
(r'[<>]{1,2}(?=[ \t=])', Punctuation),
include('eval-or-plain'),
],
'plain': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
'html-attributes': [
(r'\s+', Text),
(r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
(r'[\w:-]+', Name.Attribute),
(r'\)', Text, '#pop'),
],
'html-attribute-value': [
(r'[ \t]+', Text),
(r'\w+', Name.Variable, '#pop'),
(r'@\w+', Name.Variable.Instance, '#pop'),
(r'\$\w+', Name.Variable.Global, '#pop'),
(r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
(r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
],
'html-comment-block': [
(_dot + '+', Comment),
(r'\n', Text, 'root'),
],
'scaml-comment-block': [
(_dot + '+', Comment.Preproc),
(r'\n', Text, 'root'),
],
'filter-block': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
}
class PugLexer(ExtendedRegexLexer):
"""
For Pug markup.
Pug is a variant of Scaml, see:
http://scalate.fusesource.org/documentation/scaml-reference.html
.. versionadded:: 1.4
"""
name = 'Pug'
aliases = ['pug', 'jade']
filenames = ['*.pug', '*.jade']
mimetypes = ['text/x-pug', 'text/x-jade']
flags = re.IGNORECASE
_dot = r'.'
tokens = {
'root': [
(r'[ \t]*\n', Text),
(r'[ \t]*', _indentation),
],
'css': [
(r'\.[\w:-]+', Name.Class, 'tag'),
(r'\#[\w:-]+', Name.Function, 'tag'),
],
'eval-or-plain': [
(r'[&!]?==', Punctuation, 'plain'),
(r'([&!]?[=~])(' + _dot + r'*\n)',
bygroups(Punctuation, using(ScalaLexer)), 'root'),
default('plain'),
],
'content': [
include('css'),
(r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
(r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
bygroups(Comment, Comment.Special, Comment),
'#pop'),
(r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
'#pop'),
(r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
'scaml-comment-block'), '#pop'),
(r'(-@\s*)(import)?(' + _dot + r'*\n)',
bygroups(Punctuation, Keyword, using(ScalaLexer)),
'#pop'),
(r'(-)(' + _dot + r'*\n)',
bygroups(Punctuation, using(ScalaLexer)),
'#pop'),
(r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
'#pop'),
(r'[\w:-]+', Name.Tag, 'tag'),
(r'\|', Text, 'eval-or-plain'),
],
'tag': [
include('css'),
(r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)),
(r'\[' + _dot + r'*?\]', using(ScalaLexer)),
(r'\(', Text, 'html-attributes'),
(r'/[ \t]*\n', Punctuation, '#pop:2'),
(r'[<>]{1,2}(?=[ \t=])', Punctuation),
include('eval-or-plain'),
],
'plain': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
'html-attributes': [
(r'\s+', Text),
(r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
(r'[\w:-]+', Name.Attribute),
(r'\)', Text, '#pop'),
],
'html-attribute-value': [
(r'[ \t]+', Text),
(r'\w+', Name.Variable, '#pop'),
(r'@\w+', Name.Variable.Instance, '#pop'),
(r'\$\w+', Name.Variable.Global, '#pop'),
(r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
(r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
],
'html-comment-block': [
(_dot + '+', Comment),
(r'\n', Text, 'root'),
],
'scaml-comment-block': [
(_dot + '+', Comment.Preproc),
(r'\n', Text, 'root'),
],
'filter-block': [
(r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
(r'(#\{)(' + _dot + r'*?)(\})',
bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
(r'\n', Text, 'root'),
],
}
JadeLexer = PugLexer # compat