usse/scrape/venv/lib/python3.10/site-packages/docutils/io.py

665 lines
23 KiB
Python
Raw Normal View History

2023-12-22 14:26:01 +00:00
# $Id: io.py 9369 2023-05-02 23:04:27Z milde $
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.
"""
I/O classes provide a uniform API for low-level input and output. Subclasses
exist for a variety of input/output mechanisms.
"""
__docformat__ = 'reStructuredText'
import codecs
import locale
import os
import re
import sys
import warnings
from docutils import TransformSpec
# Guess the locale's preferred encoding.
# If no valid guess can be made, _locale_encoding is set to `None`:
#
# TODO: check whether this is set correctly with every OS and Python version
# or whether front-end tools need to call `locale.setlocale()`
# before importing this module
try:
    # Return locale encoding also in UTF-8 mode
    with warnings.catch_warnings():
        # Silence warnings emitted by the locale look-ups below.
        warnings.simplefilter("ignore")
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
        # Raises AttributeError if no encoding was reported (None);
        # handled by the generic handler below.
        _locale_encoding = _locale_encoding.lower()
except ValueError as error:  # OS X may set UTF-8 without language code
    # See https://bugs.python.org/issue18378 fixed in 3.8
    # and https://sourceforge.net/p/docutils/bugs/298/.
    # Drop the special case after requiring Python >= 3.8
    if "unknown locale: UTF-8" in error.args:
        _locale_encoding = "utf-8"
    else:
        _locale_encoding = None
except Exception:
    # Any other problem determining the locale -> use None.
    # `Exception` (not a bare ``except``) so that KeyboardInterrupt and
    # SystemExit raised during import are not silently discarded.
    _locale_encoding = None

# Only keep the guess if Python knows a codec of that name.
try:
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    # LookupError: unknown codec name; TypeError: `_locale_encoding` is None.
    _locale_encoding = None
class InputError(OSError):
    """`OSError` subclass signalling a failure of an input source."""
class OutputError(OSError):
    """`OSError` subclass signalling a failure of an output destination."""
def check_encoding(stream, encoding):
    """Tell whether the encoding of `stream` is the same codec as `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of one
    codec (e.g. "utf8" and "UTF-8") compare equal.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if both names resolve to the same codec,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or a non-string value
        return None
    return stream_codec == wanted_codec
def error_string(err):
    """Format Exception `err` as "<class name>: <message>"."""
    class_name = err.__class__.__name__
    return ': '.join((class_name, str(err)))
class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    A Docutils input object has to offer a `read()` method returning
    the source (typically a `str` instance).

    Deriving from `TransformSpec` lets input objects contribute
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        # Text encoding for the input source.
        self.encoding = encoding
        # Text decoding error handler.
        self.error_handler = error_handler
        # The source of input data.
        self.source = source
        # A text reference to the source; an empty value selects the
        # class-level default.
        self.source_path = source_path or self.default_source_path
        # The encoding that successfully decoded the source data.
        self.successful_encoding = None

    def __repr__(self):
        return (f'{self.__class__}: source={self.source!r}, '
                f'source_path={self.source_path!r}')

    def read(self):
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        `str` instances are returned unchanged (nothing to decode).

        With `self.encoding` unset, the encoding is taken from the data
        itself (BOM or encoding declaration); failing that, UTF-8, the
        locale encoding, and (as last resort) 'latin-1' are tried in turn.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        Raise UnicodeError if unsuccessful.

        Provisional:
          - Raise UnicodeError (instead of falling back to the locale
            encoding) if decoding the source with the default encoding
            (UTF-8) fails and Python is started in `UTF-8 mode`.

            Raise UnicodeError (instead of falling back to "latin1") if
            both, default and locale encoding, fail.

          - Only remove BOM (U+FEFF ZWNBSP at start of data),
            no other ZWNBSPs.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # already text
            return data

        # An explicitly given encoding is trusted; otherwise trust what
        # the data declares about itself (explicitly or via a BOM).
        trusted = self.encoding or self.determine_encoding_from_data(data)
        if trusted:
            candidates = [trusted]
        else:
            # Heuristics, used only without explicit encoding and BOM.
            # UTF-8 goes first because it only matches data that *IS* UTF-8.
            # TODO: don't fall back to 'latin-1' (API change).
            candidates = ['utf-8', 'latin-1']
            # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
            # to respect UTF-8 mode (API change).
            # (Check if it is a valid encoding and not UTF-8)
            if _locale_encoding and _locale_encoding != 'utf-8':
                candidates.insert(1, _locale_encoding)

        for candidate in candidates:
            try:
                text = str(data, candidate, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # remember the failure for the final error message
                last_error = err
                continue
            self.successful_encoding = candidate
            # Strip the BOM and any other ZWNBSPs.
            # TODO: only remove BOM (ZWNBSP at start of data)
            # and only if 'self.encoding' is None. (API change)
            return text.replace('\ufeff', '')

        tried = ", ".join(repr(enc) for enc in candidates)
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            f'{tried}.\n'
            f'({error_string(last_error)})')

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to find the encoding of `data` by inspecting `data` itself.

        A byte order mark (BOM) takes precedence; otherwise the first
        two lines are searched for an encoding declaration.
        Return the encoding name or None if nothing was found.
        """
        # byte order mark at the very start?
        from_bom = next((name for bom, name in self.byte_order_marks
                         if data.startswith(bom)), None)
        if from_bom is not None:
            return from_bom
        # encoding declaration pattern in the first 2 lines of the file?
        searches = (self.coding_slug.search(line)
                    for line in data.splitlines()[:2])
        declaration = next(filter(None, searches), None)
        if declaration is None:
            return None
        return declaration.group(1).decode('ascii')

    def isatty(self):
        """Return True, if the input source is connected to a TTY device."""
        try:
            connected = self.source.isatty()
        except AttributeError:
            # source offers no `isatty()` method
            connected = False
        return connected
class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    A Docutils output object has to offer a `write()` method that
    accepts and handles a single argument (the output).

    Deriving from `TransformSpec` lets output objects contribute
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        # Text encoding for the output destination.
        self.encoding = encoding
        # Text encoding error handler (an empty value selects 'strict').
        self.error_handler = error_handler or 'strict'
        # The destination for output data.
        self.destination = destination
        # A text reference to the destination; an empty value selects the
        # class-level default.
        self.destination_path = (destination_path
                                 or self.default_destination_path)

    def __repr__(self):
        return (f'{self.__class__}: destination={self.destination!r}, '
                f'destination_path={self.destination_path!r}')

    def write(self, data):
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data):
        """
        Encode and return `data`.

        `str` data is encoded with `self.encoding` and
        `self.error_handler`; anything else (e.g. `bytes`) is handed
        back untouched.

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors
                        (used when `bytes` must be written as text).
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            destination = open(destination, 'w')

        self.destination = destination
        """Where warning output is sent."""

        # Precedence: explicit argument, the stream's own encoding,
        # the locale encoding guessed at import time, ASCII.
        self.encoding = (encoding or getattr(destination, 'encoding', None)
                         or _locale_encoding or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.

        Text the destination cannot encode is written with
        `self.encoding_errors` applied (by default, offending characters
        are replaced with backslash escapes).
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            encoded = data.encode(self.encoding, self.encoding_errors)
            try:
                self.destination.write(encoded)
            except TypeError:
                # Text streams reject `bytes`. Apply the error handler
                # using the stream's own encoding and write the result
                # as text, so the message is not lost.
                stream_encoding = (getattr(self.destination, 'encoding', None)
                                   or self.encoding)
                safe_bytes = data.encode(stream_encoding,
                                         self.encoding_errors)
                self.destination.write(
                    safe_bytes.decode(stream_encoding, self.decoding_errors))
        except TypeError:
            if isinstance(data, str):  # destination may expect bytes
                self.destination.write(data.encode(self.encoding,
                                                   self.encoding_errors))
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                self.destination.write(str(data, self.encoding,
                                           self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            return False
class FileInput(Input):
    """
    Input for single, simple file-like objects.
    """

    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened and
        `UnicodeError` if `source` is opened with an encoding that differs
        from `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                try:
                    # Without explicit encoding, try UTF-8 (dropping a BOM);
                    # `read()` falls back to heuristics on failure.
                    self.source = open(source_path, mode,
                                       encoding=self.encoding or 'utf-8-sig',
                                       errors=self.error_handler)
                except OSError as error:
                    raise InputError(error.errno, error.strerror,
                                     source_path) from error
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # Use the name of the file-like object, if it has one.
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        If reading fails with a decoding error, no encoding was specified,
        and a source path is known, the file is re-read in binary mode and
        decoded with the heuristics of `Input.decode()`.
        Line endings are normalised to "\\n".
        """
        try:
            if self.source is sys.stdin:
                # Read as binary data to circumvent auto-decoding.
                # A replaced `sys.stdin` (e.g. an `io.StringIO`) may lack
                # the `buffer` attribute: read from it directly then.
                data = getattr(self.source, 'buffer', self.source).read()
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics;
                # the context manager closes the file even if `read()` fails
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        data = self.decode(data)
        # normalise newlines
        return '\n'.join(data.splitlines()+[''])

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):
    """Output for single, simple file-like objects."""

    default_destination_path = '<file>'

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # The file is opened lazily by the first `write()`.
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # Use the name of the file-like object, if it has one.
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            # `open()` accepts no text arguments in binary mode
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path) from error
        self.opened = True

    def write(self, data):
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        Raise `TypeError` if the destination rejects the data and no
        fallback applies, `ValueError` if `bytes` cannot be written because
        the destination's encoding differs from `self.encoding`, and
        `UnicodeError` if the data cannot be encoded.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            if not isinstance(data, bytes):
                # The destination rejected non-bytes data (e.g. `str`
                # written to a binary stream). Report it instead of
                # silently dropping the output.
                raise
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError(
                        f'Encoding of {self.destination_path} '
                        f'({self.destination.encoding}) differs \n'
                        f' from specified encoding ({self.encoding})')
                else:
                    raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})') from err
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout`/`sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    Variant of docutils.io.FileOutput that opens its file in binary mode.
    """

    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # tools/rst2odt.py but not by core.rst2odt().
    mode = 'wb'
class StringInput(Input):
    """Input taken from a `str` or `bytes` instance."""

    default_source_path = '<string>'

    def read(self):
        """Return `self.source` as `str` instance.

        Decoding (if required) is delegated to `self.decode()`.
        """
        text = self.decode(self.source)
        return text
class StringOutput(Output):
    """Output kept in memory as a `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Store `data` in `self.destination`, and return it.

        With `self.encoding` set to the pseudo encoding name "unicode",
        `data` must be a `str` instance and is stored/returned unchanged
        (cf. `Output.encode`).

        Otherwise, `data` can be a `bytes` or `str` instance and is
        stored/returned as a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        stored = self.encode(data)
        self.destination = stored
        return stored
class NullInput(Input):
    """Degenerate input that never yields any content."""

    default_source_path = 'null input'

    def read(self):
        """Return an empty string, ignoring the source."""
        return ''
class NullOutput(Output):
    """Degenerate output that discards everything written to it."""

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data`; return None."""
        return None
class DocTreeInput(Input):
    """
    Adapter for document tree input.

    The document tree is supplied via the ``source`` parameter and is
    handed back by `read()` as is.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree stored in `self.source`."""
        doctree = self.source
        return doctree