# $Id: frontmatter.py 9351 2023-04-17 20:26:33Z milde $
# Author: David Goodger, Ueli Schlaepfer <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
Transforms related to the front matter of a document or a section
(information found before the main text):

- `DocTitle`: Used to transform a lone top-level section's title to
  the document title, promote a remaining lone top-level section's
  title to the document subtitle, and determine the document's title
  metadata (document['title']) based on the document title and/or the
  "title" setting.

- `SectionSubTitle`: Used to transform a lone subsection into a
  subtitle.

- `DocInfo`: Used to transform a bibliographic field list into docinfo
  elements.
"""

__docformat__ = 'reStructuredText'

import re

from docutils import nodes, parsers, utils
from docutils.transforms import TransformError, Transform


class TitlePromoter(Transform):

    """
    Abstract base class for DocTitle and SectionSubTitle transforms.
    """

    def promote_title(self, node):
        """
        Transform the following tree::

            <node>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                ...

        `node` is normally a document.
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        # `node` must not have a title yet.
        assert not (len(node) and isinstance(node[0], nodes.title))
        section, index = self.candidate_index(node)
        if index is None:
            return False

        # Transfer the section's attributes to the node:
        # NOTE: Change `replace` to False to NOT replace attributes that
        #       already exist in node with those in section.
        # NOTE: Remove `and_source` to NOT copy the 'source'
        #       attribute from section.
        node.update_all_atts_concatenating(section, replace=True,
                                           and_source=True)

        # setup_child is called automatically for all nodes.
        node[:] = (section[:1]      # section title
                   + node[:index]   # everything that was in the
                                    # node before the section
                   + section[1:])   # everything that was in the section
        assert isinstance(node[0], nodes.title)
        return True

    def promote_subtitle(self, node):
        """
        Transform the following node tree::

            <node>
                <title>
                <section>
                    <title>
                    ...

        into ::

            <node>
                <title>
                <subtitle>
                ...
        """
        # Type check
        if not isinstance(node, nodes.Element):
            raise TypeError('node must be of Element-derived type.')

        subsection, index = self.candidate_index(node)
        if index is None:
            return False
        subtitle = nodes.subtitle()

        # Transfer the subsection's attributes to the new subtitle
        # NOTE: Change `replace` to False to NOT replace attributes
        #       that already exist in node with those in section.
        # NOTE: Remove `and_source` to NOT copy the 'source'
        #       attribute from section.
        subtitle.update_all_atts_concatenating(subsection, replace=True,
                                               and_source=True)

        # Transfer the contents of the subsection's title to the
        # subtitle:
        subtitle[:] = subsection[0][:]
        node[:] = (node[:1]       # title
                   + [subtitle]
                   # everything that was before the section:
                   + node[1:index]
                   # everything that was in the subsection:
                   + subsection[1:])
        return True

    def candidate_index(self, node):
        """
        Find and return the promotion candidate and its index.

        Return (None, None) if no valid candidate was found.
        """
        index = node.first_child_not_matching_class(
            nodes.PreBibliographic)
        if (index is None or len(node) > (index + 1)
                or not isinstance(node[index], nodes.section)):
            return None, None
        else:
            return node[index], index
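

# Illustrative sketch (not part of the docutils API): ``promote_title()``
# only fires when the first non-PreBibliographic child of `node` is a
# <section> and nothing follows that section; otherwise
# ``candidate_index()`` returns ``(None, None)`` and the tree is left
# untouched.  Schematically, with the section's attributes merged into
# the node::
#
#     <node>                          <node names="...">
#         <comment>                       <title>
#         <section names="...">   -->     <comment>
#             <title>                     ...section body...
#             ...section body...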


class DocTitle(TitlePromoter):

    """
    In reStructuredText_, there is no way to specify a document title
    and subtitle explicitly. Instead, we can supply the document title
    (and possibly the subtitle as well) implicitly, and use this
    two-step transform to "raise" or "promote" the title(s) (and their
    corresponding section contents) to the document level.

    1. If the document contains a single top-level section as its first
       non-comment element, the top-level section's title becomes the
       document's title, and the top-level section's contents become
       the document's immediate contents. The lone top-level section
       header must be the first non-comment element in the document.

       For example, take this input text::

           =================
            Top-Level Title
           =================

           A paragraph.

       Once parsed, it looks like this::

           <document>
               <section names="top-level title">
                   <title>
                       Top-Level Title
                   <paragraph>
                       A paragraph.

       After running the DocTitle transform, we have::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <paragraph>
                   A paragraph.

    2. If step 1 successfully determines the document title, we
       continue by checking for a subtitle.

       If the lone top-level section itself contains a single
       second-level section as its first non-comment element, that
       section's title is promoted to the document's subtitle, and
       that section's contents become the document's immediate
       contents. Given this input text::

           =================
            Top-Level Title
           =================

           Second-Level Title
           ~~~~~~~~~~~~~~~~~~

           A paragraph.

       After parsing and running the Section Promotion transform, the
       result is::

           <document names="top-level title">
               <title>
                   Top-Level Title
               <subtitle names="second-level title">
                   Second-Level Title
               <paragraph>
                   A paragraph.

       (Note that the implicit hyperlink target generated by the
       "Second-Level Title" is preserved on the "subtitle" element
       itself.)

    Any comment elements occurring before the document title or
    subtitle are accumulated and inserted as the first body elements
    after the title(s).

    This transform also sets the document's metadata title
    (document['title']).

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    """

    default_priority = 320

    def set_metadata(self):
        """
        Set document['title'] metadata title from the following
        sources, listed in order of priority:

        * Existing document['title'] attribute.
        * "title" setting.
        * Document title node (as promoted by promote_title).
        """
        if not self.document.hasattr('title'):
            if self.document.settings.title is not None:
                self.document['title'] = self.document.settings.title
            elif len(self.document) and isinstance(self.document[0],
                                                   nodes.title):
                self.document['title'] = self.document[0].astext()

    def apply(self):
        if self.document.settings.setdefault('doctitle_xform', True):
            # promote_(sub)title defined in TitlePromoter base class.
            if self.promote_title(self.document):
                # If a title has been promoted, also try to promote a
                # subtitle.
                self.promote_subtitle(self.document)
        # Set document['title'].
        self.set_metadata()
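

# A minimal usage sketch, assuming the full docutils package is importable;
# ``source`` and ``doctree`` are hypothetical names, not part of this
# module::
#
#     from docutils.core import publish_doctree
#     doctree = publish_doctree(source)   # DocTitle runs by default
#     doctree.get('title')                # metadata title (see set_metadata)
#
# Passing ``settings_overrides={'doctitle_xform': False}`` to
# ``publish_doctree()`` leaves the lone top-level section in place and
# skips title promotion.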


class SectionSubTitle(TitlePromoter):

    """
    This works like document subtitles, but for sections. For example, ::

        <section>
            <title>
                Title
            <section>
                <title>
                    Subtitle
                ...

    is transformed into ::

        <section>
            <title>
                Title
            <subtitle>
                Subtitle
            ...

    For details refer to the docstring of DocTitle.
    """

    default_priority = 350

    def apply(self):
        if not self.document.settings.setdefault('sectsubtitle_xform', True):
            return
        for section in self.document.findall(nodes.section):
            # On our way through the node tree, we are modifying it
            # but only the not-yet-visited part, so that the iterator
            # returned by findall() is not corrupted.
            self.promote_subtitle(section)


class DocInfo(Transform):

    """
    This transform is specific to the reStructuredText_ markup syntax;
    see "Bibliographic Fields" in the `reStructuredText Markup
    Specification`_ for a high-level description. This transform
    should be run *after* the `DocTitle` transform.

    Given a field list as the first non-comment element after the
    document title and subtitle (if present), registered bibliographic
    field names are transformed to the corresponding DTD elements,
    becoming child elements of the "docinfo" element (except for a
    dedication and/or an abstract, which become "topic" elements after
    "docinfo").

    For example, given this document fragment after parsing::

        <document>
            <title>
                Document Title
            <field_list>
                <field>
                    <field_name>
                        Author
                    <field_body>
                        <paragraph>
                            A. Name
                <field>
                    <field_name>
                        Status
                    <field_body>
                        <paragraph>
                            $RCSfile$
            ...

    After running the bibliographic field list transform, the
    resulting document tree would look like this::

        <document>
            <title>
                Document Title
            <docinfo>
                <author>
                    A. Name
                <status>
                    frontmatter.py
            ...

    The "Status" field contained an expanded RCS keyword, which is
    normally (but optionally) cleaned up by the transform. The sole
    contents of the field body must be a paragraph containing an
    expanded RCS keyword of the form "$keyword: expansion text $". Any
    RCS keyword can be processed in any bibliographic field. The
    dollar signs and leading RCS keyword name are removed. Extra
    processing is done for the following RCS keywords:

    - "RCSfile" expands to the name of the file in the RCS or CVS
      repository, which is the name of the source file with a ",v"
      suffix appended. The transform will remove the ",v" suffix.

    - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
      time zone). The RCS Keywords transform will extract just the
      date itself and transform it to an ISO 8601 format date, as in
      "2000-12-31".

    (Since the source file for this text is itself stored under CVS,
    we can't show an example of the "Date" RCS keyword because we
    can't prevent any RCS keywords used in this explanation from
    being expanded. Only the "RCSfile" keyword is stable; its
    expansion text changes only if the file name changes.)

    .. _reStructuredText: https://docutils.sourceforge.io/rst.html
    .. _reStructuredText Markup Specification:
       https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html
    """

    default_priority = 340

    biblio_nodes = {
        'author': nodes.author,
        'authors': nodes.authors,
        'organization': nodes.organization,
        'address': nodes.address,
        'contact': nodes.contact,
        'version': nodes.version,
        'revision': nodes.revision,
        'status': nodes.status,
        'date': nodes.date,
        'copyright': nodes.copyright,
        'dedication': nodes.topic,
        'abstract': nodes.topic}
    """Canonical field name (lowcased) to node class name mapping for
    bibliographic fields (field_list)."""

    def apply(self):
        if not self.document.settings.setdefault('docinfo_xform', True):
            return
        document = self.document
        index = document.first_child_not_matching_class(
            nodes.PreBibliographic)
        if index is None:
            return
        candidate = document[index]
        if isinstance(candidate, nodes.field_list):
            biblioindex = document.first_child_not_matching_class(
                (nodes.Titular, nodes.Decorative, nodes.meta))
            nodelist = self.extract_bibliographic(candidate)
            del document[index]  # untransformed field list (candidate)
            document[biblioindex:biblioindex] = nodelist
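
    # A minimal usage sketch, assuming the full docutils package is
    # importable; the field names below are examples::
    #
    #     from docutils.core import publish_doctree
    #     doctree = publish_doctree(
    #         'Title\n=====\n\n:Author: A. Name\n:Custom: plain field\n')
    #
    # "Author" is a registered bibliographic field and becomes
    # <docinfo><author>; unregistered fields such as "Custom" are carried
    # over into <docinfo> as generic <field> elements, with a class value
    # derived from the field name.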

    def extract_bibliographic(self, field_list):
        docinfo = nodes.docinfo()
        bibliofields = self.language.bibliographic_fields
        labels = self.language.labels
        topics = {'dedication': None, 'abstract': None}
        for field in field_list:
            try:
                name = field[0][0].astext()
                normedname = nodes.fully_normalize_name(name)
                if not (len(field) == 2 and normedname in bibliofields
                        and self.check_empty_biblio_field(field, name)):
                    raise TransformError
                canonical = bibliofields[normedname]
                biblioclass = self.biblio_nodes[canonical]
                if issubclass(biblioclass, nodes.TextElement):
                    if not self.check_compound_biblio_field(field, name):
                        raise TransformError
                    utils.clean_rcs_keywords(
                        field[1][0], self.rcs_keyword_substitutions)
                    docinfo.append(biblioclass('', '', *field[1][0]))
                elif issubclass(biblioclass, nodes.authors):
                    self.extract_authors(field, name, docinfo)
                elif issubclass(biblioclass, nodes.topic):
                    if topics[canonical]:
                        field[-1] += self.document.reporter.warning(
                            'There can only be one "%s" field.' % name,
                            base_node=field)
                        raise TransformError
                    title = nodes.title(name, labels[canonical])
                    title[0].rawsource = labels[canonical]
                    topics[canonical] = biblioclass(
                        '', title, classes=[canonical], *field[1].children)
                else:
                    docinfo.append(biblioclass('', *field[1].children))
            except TransformError:
                if len(field[-1]) == 1 \
                       and isinstance(field[-1][0], nodes.paragraph):
                    utils.clean_rcs_keywords(
                        field[-1][0], self.rcs_keyword_substitutions)
                # if normedname not in bibliofields:
                classvalue = nodes.make_id(normedname)
                if classvalue:
                    field['classes'].append(classvalue)
                docinfo.append(field)
        nodelist = []
        if len(docinfo) != 0:
            nodelist.append(docinfo)
        for name in ('dedication', 'abstract'):
            if topics[name]:
                nodelist.append(topics[name])
        return nodelist

    def check_empty_biblio_field(self, field, name):
        if len(field[-1]) < 1:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract empty bibliographic field "{name}".',
                base_node=field)
            return False
        return True

    def check_compound_biblio_field(self, field, name):
        # Check that the `field` body contains a single paragraph
        # (i.e. it must *not* be a compound element).
        f_body = field[-1]
        if len(f_body) == 1 and isinstance(f_body[0], nodes.paragraph):
            return True
        # Restore single author name with initial (E. Xampl) parsed as
        # enumerated list.
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#enumerated-lists
        if (isinstance(f_body[0], nodes.enumerated_list)
                and '\n' not in f_body.rawsource.strip()):
            # parse into a dummy document and use created nodes
            _document = utils.new_document('*DocInfo transform*',
                                           field.document.settings)
            parser = parsers.rst.Parser()
            parser.parse('\\' + f_body.rawsource, _document)
            if (len(_document.children) == 1
                    and isinstance(_document.children[0], nodes.paragraph)):
                f_body.children = _document.children
                return True
        # Check failed, add a warning
        content = [f'<{e.tagname}>' for e in f_body.children]
        if len(content) > 1:
            content = '[' + ', '.join(content) + ']'
        else:
            content = 'a ' + content[0]
        f_body += self.document.reporter.warning(
            f'Bibliographic field "{name}"\nmust contain '
            f'a single <paragraph>, not {content}.',
            base_node=field)
        return False

    rcs_keyword_substitutions = [
        (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
                    r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
        (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
        (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1')]
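
    # Doctest-style sketch of the first substitution above (the '$' is
    # split so that the sample string is not itself an expandable RCS
    # keyword)::
    #
    #     >>> pattern, repl = DocInfo.rcs_keyword_substitutions[0]
    #     >>> pattern.sub(repl, '$' 'Date: 2000/12/31 23:59:59 $')
    #     '2000-12-31'
    #
    # ``utils.clean_rcs_keywords()`` uses these (pattern, replacement)
    # pairs to strip expanded RCS keywords from a field's paragraph text.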

    def extract_authors(self, field, name, docinfo):
        try:
            if len(field[1]) == 1:
                if isinstance(field[1][0], nodes.paragraph):
                    authors = self.authors_from_one_paragraph(field)
                elif isinstance(field[1][0], nodes.bullet_list):
                    authors = self.authors_from_bullet_list(field)
                else:
                    raise TransformError
            else:
                authors = self.authors_from_paragraphs(field)
            authornodes = [nodes.author('', '', *author)
                           for author in authors if author]
            if len(authornodes) >= 1:
                docinfo.append(nodes.authors('', *authornodes))
            else:
                raise TransformError
        except TransformError:
            field[-1] += self.document.reporter.warning(
                f'Cannot extract "{name}" from bibliographic field:\n'
                f'Bibliographic field "{name}" must contain either\n'
                ' a single paragraph (with author names separated by one of '
                f'"{"".join(self.language.author_separators)}"),\n'
                ' multiple paragraphs (one per author),\n'
                ' or a bullet list with one author name per item.\n'
                'Note: Leading initials can cause (mis)recognizing names '
                'as enumerated list.',
                base_node=field)
            raise

    def authors_from_one_paragraph(self, field):
        """Return list of Text nodes with author names in `field`.

        Author names must be separated by one of the "author separators"
        defined for the document language (default: ";" or ",").
        """
        # @@ keep original formatting? (e.g. ``:authors: A. Test, *et-al*``)
        text = ''.join(str(node)
                       for node in field[1].findall(nodes.Text))
        if not text:
            raise TransformError
        for authorsep in self.language.author_separators:
            # don't split at escaped `authorsep`:
            pattern = '(?<!\x00)%s' % authorsep
            authornames = re.split(pattern, text)
            if len(authornames) > 1:
                break
        authornames = (name.strip() for name in authornames)
        return [[nodes.Text(name)] for name in authornames if name]

    def authors_from_bullet_list(self, field):
        authors = []
        for item in field[1][0]:
            if isinstance(item, nodes.comment):
                continue
            if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
                raise TransformError
            authors.append(item[0].children)
        if not authors:
            raise TransformError
        return authors

    def authors_from_paragraphs(self, field):
        for item in field[1]:
            if not isinstance(item, (nodes.paragraph, nodes.comment)):
                raise TransformError
        authors = [item.children for item in field[1]
                   if not isinstance(item, nodes.comment)]
        return authors
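

if __name__ == '__main__':
    # Minimal, self-contained demonstration sketch (assumes the full
    # docutils package is importable); not part of the public interface.
    # publish_doctree() applies the standard transforms, including the
    # ones defined above, to a reStructuredText sample.
    from docutils.core import publish_doctree

    sample = """\
Document Title
==============

Subtitle
--------

:Author: A. Name
:Date: $Date: 2023-04-17 20:26:33 $
:Custom Field: kept as a generic docinfo field

A paragraph.
"""
    doctree = publish_doctree(sample)
    print(doctree.pformat())
    print('metadata title:', doctree.get('title'))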