usse/funda-scraper/venv/lib/python3.10/site-packages/pandas/io/common.py

1146 lines
37 KiB
Python
Raw Normal View History

2023-02-20 22:38:24 +00:00
"""Common IO api utilities"""
from __future__ import annotations
import bz2
import codecs
from collections import abc
import dataclasses
import functools
import gzip
from io import (
BufferedIOBase,
BytesIO,
RawIOBase,
StringIO,
TextIOBase,
TextIOWrapper,
)
import mmap
import os
from pathlib import Path
import re
from typing import (
IO,
Any,
AnyStr,
Generic,
Literal,
Mapping,
TypeVar,
cast,
overload,
)
from urllib.parse import (
urljoin,
urlparse as parse_url,
uses_netloc,
uses_params,
uses_relative,
)
import warnings
import zipfile
from pandas._typing import (
BaseBuffer,
CompressionDict,
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
from pandas.compat import get_lzma_file
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import is_file_like
from pandas.core.shared_docs import _shared_docs
# Every URL scheme that urllib.parse lists as relative-, netloc- or
# params-aware; is_url() accepts a string only when its scheme is in this set.
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
# Drop the empty scheme (if present) so that a string without any scheme is
# never classified as a URL.
_VALID_URLS.discard("")
# Matches a leading "<scheme>://" prefix: one letter followed by letters,
# digits, "+", "-" or ".". The second "+" in the character class is redundant.
_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
# Type variable for helpers that hand a buffer back with its type unchanged.
BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
@dataclasses.dataclass
class IOArgs:
    """
    Bundle of values produced by io/common.py:_get_filepath_or_buffer.

    filepath_or_buffer: The resolved path string or the buffer to read/write.
    encoding: Text encoding associated with the target.
    mode: Mode string the target should be (or was) opened with.
    compression: Compression options; the method is stored under "method".
    should_close: Whether the buffer was created internally and therefore has
        to be closed by pandas instead of by the caller.
    """

    filepath_or_buffer: str | BaseBuffer
    encoding: str
    mode: str
    compression: CompressionDict
    should_close: bool = dataclasses.field(default=False)
@dataclasses.dataclass
class IOHandles(Generic[AnyStr]):
    """
    Result object of io/common.py:get_handle.

    Works as a context manager so that every buffer opened on the caller's
    behalf is released on exit, including the corner case where a
    TextIOWrapper was layered on top of a buffer owned by the caller.

    handle: The file handle to be used.
    created_handles: All file handles that are created by get_handle
    is_wrapped: Whether a TextIOWrapper needs to be detached.
    """

    # handle might not implement the IO-interface
    handle: IO[AnyStr]
    compression: CompressionDict
    created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list)
    is_wrapped: bool = False
    is_mmap: bool = False

    def close(self) -> None:
        """
        Release every buffer that was created for this handle.

        A TextIOWrapper that was put around a caller-provided buffer is only
        flushed and detached, never closed, because closing it would also
        close the buffer the caller still owns.
        """
        if self.is_wrapped:
            wrapper = self.handle
            assert isinstance(wrapper, TextIOWrapper)
            wrapper.flush()
            wrapper.detach()
            # the detached wrapper must not be closed in the loop below
            self.created_handles.remove(wrapper)
        for created in self.created_handles:
            created.close()
        # reset the bookkeeping so a second close() has nothing left to do
        self.created_handles = []
        self.is_wrapped = False

    def __enter__(self) -> IOHandles[AnyStr]:
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()
def is_url(url: object) -> bool:
    """
    Tell whether ``url`` is a string whose scheme is a known URL protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        True when `url` is a string and its parsed scheme is one of the
        recognised URL schemes, False otherwise (including for non-strings).
    """
    return isinstance(url, str) and parse_url(url).scheme in _VALID_URLS
@overload
def _expand_user(filepath_or_buffer: str) -> str:
...
@overload
def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT:
...
def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
"""
Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
Parameters
----------
filepath_or_buffer : object to be converted if possible
Returns
-------
expanded_filepath_or_buffer : an expanded filepath or the
input if not expandable
"""
if isinstance(filepath_or_buffer, str):
return os.path.expanduser(filepath_or_buffer)
return filepath_or_buffer
def validate_header_arg(header: object) -> None:
    """
    Reject boolean values for the ``header`` argument.

    Raises
    ------
    TypeError
        If ``header`` is a bool; any other value is accepted silently.
    """
    if not isinstance(header, bool):
        return
    raise TypeError(
        "Passing a bool to header is invalid. Use header=None for no header or "
        "header=int or list-like of ints to specify "
        "the row(s) making up the column names"
    )
@overload
def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str:
    ...


@overload
def stringify_path(
    filepath_or_buffer: BaseBufferT, convert_file_like: bool = ...
) -> BaseBufferT:
    ...


def stringify_path(
    filepath_or_buffer: FilePath | BaseBufferT,
    convert_file_like: bool = False,
) -> str | BaseBufferT:
    """
    Turn a path-like object into a string where that is possible.

    Parameters
    ----------
    filepath_or_buffer : object to be converted
    convert_file_like : bool, default False
        When False, objects that look like open files are returned untouched
        even if they also implement ``__fspath__``.

    Returns
    -------
    str_filepath_or_buffer : maybe a string version of the object

    Notes
    -----
    Objects implementing the fspath protocol are converted through their
    ``__fspath__`` method, and a leading ``~`` in a resulting string is
    expanded. Everything else (bytes, buffers, arbitrary objects) is passed
    through unchanged.
    """
    keep_file_like = not convert_file_like and is_file_like(filepath_or_buffer)
    if keep_file_like:
        # GH 38125: some fsspec objects implement os.PathLike but have already
        # opened a file; returning them as-is avoids opening the file a second
        # time. infer_compression passes convert_file_like=True to still get
        # at the underlying path.
        return cast(BaseBufferT, filepath_or_buffer)

    path = filepath_or_buffer
    if isinstance(path, os.PathLike):
        path = path.__fspath__()
    return _expand_user(path)
def urlopen(*args, **kwargs):
    """
    Forward to the stdlib ``urllib.request.urlopen``.

    The import happens inside the function so that the large chunk of stdlib
    pulled in by ``urllib.request`` is only loaded when a URL is actually
    opened.
    """
    from urllib import request as _request

    return _request.urlopen(*args, **kwargs)
def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
    """
    Returns true if the given URL looks like
    something fsspec can handle

    That is: a string starting with a ``<scheme>://`` prefix whose scheme is
    neither http nor https.
    """
    if not isinstance(url, str):
        return False
    if url.startswith(("http://", "https://")):
        return False
    return bool(_RFC_3986_PATTERN.match(url))
@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer")
def _get_filepath_or_buffer(
    filepath_or_buffer: FilePath | BaseBuffer,
    encoding: str = "utf-8",
    compression: CompressionOptions = None,
    mode: str = "r",
    storage_options: StorageOptions = None,
) -> IOArgs:
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    {compression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional

    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values

        .. versionadded:: 1.2.0

    Returns
    -------
    IOArgs
        The resolved path or buffer together with encoding, mode, the
        compression options (method stored under "method") and a flag telling
        whether the buffer was opened here and must be closed by pandas.

        .. versionchanged:: 1.2.0

           Returns the dataclass IOArgs.

    Raises
    ------
    ValueError
        If ``storage_options`` is given for a local path or a buffer, or if
        the input is neither a path nor an object with ``read``/``write``.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    # handle compression dict
    compression_method, compression = get_compression_method(compression)
    compression_method = infer_compression(filepath_or_buffer, compression_method)

    # GH21227 internal compression is not used for non-binary handles.
    if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode:
        warnings.warn(
            "compression has no effect when passing a non-binary object as input.",
            RuntimeWarning,
            stacklevel=find_stack_level(),
        )
        compression_method = None

    compression = dict(compression, method=compression_method)

    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
    # print a warning when writing such files
    if (
        "w" in mode
        and compression_method in ["bz2", "xz"]
        and encoding in ["utf-16", "utf-32"]
    ):
        warnings.warn(
            f"{compression} will not write the byte order mark for {encoding}",
            UnicodeWarning,
            # attribute the warning to the caller's code, like the warning above
            stacklevel=find_stack_level(),
        )

    # Use binary mode when converting path-like objects to file-like objects (fsspec)
    # except when text mode is explicitly requested. The original mode is returned if
    # fsspec is not used.
    fsspec_mode = mode
    if "t" not in fsspec_mode and "b" not in fsspec_mode:
        fsspec_mode += "b"

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged. using fsspec appears to break the ability to infer if the
        # server responded with gzipped data
        storage_options = storage_options or {}

        # waiting until now for importing to match intended lazy logic of
        # urlopen function defined elsewhere in this module
        import urllib.request

        # assuming storage_options is to be interpreted as headers
        req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
        with urlopen(req_info) as req:
            content_encoding = req.headers.get("Content-Encoding", None)
            if content_encoding == "gzip":
                # Override compression based on Content-Encoding header
                compression = {"method": "gzip"}
            # the whole response is buffered in memory before the connection closes
            reader = BytesIO(req.read())
        return IOArgs(
            filepath_or_buffer=reader,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in Hadoop,
        # but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        # count=1: only the scheme prefix is rewritten, never a later
        # occurrence of the same text inside the key
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://", 1)
        if filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://", 1)
        fsspec = import_optional_dependency("fsspec")

        # If botocore is installed we fallback to reading with anon=True
        # to allow reads from public buckets
        err_types_to_retry_with_anon: list[Any] = []
        try:
            import_optional_dependency("botocore")
            from botocore.exceptions import (
                ClientError,
                NoCredentialsError,
            )

            err_types_to_retry_with_anon = [
                ClientError,
                NoCredentialsError,
                PermissionError,
            ]
        except ImportError:
            # without botocore there is nothing to retry on: the empty tuple
            # below matches no exception
            pass

        try:
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()
        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
        except tuple(err_types_to_retry_with_anon):
            if storage_options is None:
                storage_options = {"anon": True}
            else:
                # don't mutate user input.
                storage_options = dict(storage_options)
                storage_options["anon"] = True
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()

        return IOArgs(
            filepath_or_buffer=file_obj,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return IOArgs(
            filepath_or_buffer=_expand_user(filepath_or_buffer),
            encoding=encoding,
            compression=compression,
            should_close=False,
            mode=mode,
        )

    # is_file_like requires (read | write) & __iter__ but __iter__ is only
    # needed for read_csv(engine=python)
    if not (
        hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write")
    ):
        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
        raise ValueError(msg)

    return IOArgs(
        filepath_or_buffer=filepath_or_buffer,
        encoding=encoding,
        compression=compression,
        should_close=False,
        mode=mode,
    )
def file_path_to_url(path: str) -> str:
    """
    Build a ``file:`` URL from an absolute path in native format.

    Parameters
    ----------
    path : a path in native format

    Returns
    -------
    a valid FILE URL
    """
    # lazify expensive import (~30ms)
    from urllib.request import pathname2url

    url_path = pathname2url(path)
    return urljoin("file:", url_path)
# Maps every supported compression method to the filename suffix that
# identifies it. infer_compression walks the entries in this order when asked
# to infer the method from a path, and the keys double as the set of valid
# explicit compression names.
_compression_to_extension = {
    "gzip": ".gz",
    "bz2": ".bz2",
    "zip": ".zip",
    "xz": ".xz",
    "zstd": ".zst",
}
def get_compression_method(
    compression: CompressionOptions,
) -> tuple[str | None, CompressionDict]:
    """
    Split a compression argument into the method name and its extra options.

    Parameters
    ----------
    compression : str or mapping
        If string, specifies the compression method. If mapping, value at key
        'method' specifies compression method.

    Returns
    -------
    tuple of (compression method, compression arguments)
        The method is a string or None; the arguments are a new dict holding
        every mapping entry except 'method' (empty for non-mapping input).

    Raises
    ------
    ValueError on mapping missing 'method' key
    """
    if not isinstance(compression, Mapping):
        # plain string (or None): there are no extra options to carry along
        return compression, {}

    # work on a copy so the caller's mapping is left untouched
    compression_args = dict(compression)
    try:
        compression_method = compression_args.pop("method")
    except KeyError as err:
        raise ValueError("If mapping, compression must have key 'method'") from err
    return compression_method, compression_args
@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer")
def infer_compression(
    filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
) -> str | None:
    """
    Resolve the compression method to use for filepath_or_buffer.

    With compression='infer' the method is derived from the extension of the
    path. Any other value is validated and handed back unchanged; an unknown
    value raises.

    Parameters
    ----------
    filepath_or_buffer : str or file handle
        File path or object.
    {compression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    Returns
    -------
    string or None

    Raises
    ------
    ValueError on invalid compression specified.
    """
    if compression is None:
        return None

    if compression != "infer":
        # Compression has been specified. Check that it's valid
        if compression in _compression_to_extension:
            return compression

        # https://github.com/python/mypy/issues/5492
        # Unsupported operand types for + ("List[Optional[str]]" and "List[str]")
        valid = ["infer", None] + sorted(
            _compression_to_extension
        )  # type: ignore[operator]
        raise ValueError(
            f"Unrecognized compression type: {compression}\n"
            f"Valid compression types are {valid}"
        )

    # Infer compression: convert all path types (e.g. pathlib.Path) to strings
    path = stringify_path(filepath_or_buffer, convert_file_like=True)
    if not isinstance(path, str):
        # Cannot infer compression of a buffer, assume no compression
        return None

    # Infer compression from the filename/URL extension (case-insensitive)
    lowered = path.lower()
    for method, extension in _compression_to_extension.items():
        if lowered.endswith(extension):
            return method
    return None
def check_parent_directory(path: Path | str) -> None:
    """
    Ensure that the directory a file is about to be written into exists.

    Parameters
    ----------
    path: Path or str
        Path to check parent directory of

    Raises
    ------
    OSError
        If the parent of ``path`` is not an existing directory.
    """
    parent = Path(path).parent
    if parent.is_dir():
        return
    raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'")
# Overload: binary content requested (is_text=False) -> handle yields bytes.
@overload
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = ...,
    compression: CompressionOptions = ...,
    memory_map: bool = ...,
    is_text: Literal[False],
    errors: str | None = ...,
    storage_options: StorageOptions = ...,
) -> IOHandles[bytes]:
    ...


# Overload: text content (the default, is_text=True) -> handle yields str.
@overload
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = ...,
    compression: CompressionOptions = ...,
    memory_map: bool = ...,
    is_text: Literal[True] = ...,
    errors: str | None = ...,
    storage_options: StorageOptions = ...,
) -> IOHandles[str]:
    ...


@doc(compression_options=_shared_docs["compression_options"] % "path_or_buf")
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles[str] | IOHandles[bytes]:
    """
    Get file handle for given path/buffer and mode.

    The input is resolved through _get_filepath_or_buffer, optionally memory
    mapped, wrapped in a (de)compression object, opened if it is still a path,
    and finally wrapped so that it yields text or bytes as requested by
    ``is_text``. Every object created along the way is recorded so that it can
    be closed through the returned IOHandles.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use. Falls back to "utf-8" when not given.
    {compression_options}

        .. versionchanged:: 1.0.0
           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0
           Passing compression options as keys in dict is now
           supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.

        .. versionchanged:: 1.4.0 Zstandard support.

    memory_map : bool, default False
        See parsers._parser_params for more information.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    Returns
    -------
    IOHandles
        The handle to use together with all handles that were created for it.

        .. versionchanged:: 1.2.0
           Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If a ZIP archive opened for reading does not contain exactly one file,
        or if the compression method is not recognized.
    TypeError
        If a read mode is requested but the resulting handle has no ``read``
        attribute.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate encoding and errors
    # (both lookups are made only for validation; their results are discarded)
    codecs.lookup(encoding)
    if isinstance(errors, str):
        codecs.lookup_error(errors)

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: list[BaseBuffer]

    # memory mapping needs to be the first step
    # The last argument ("decode") is True only when no known compression is
    # used, so the mmap wrapper decodes text only for uncompressed input.
    handle, memory_map, handles = _maybe_memory_map(
        handle,
        memory_map,
        ioargs.encoding,
        ioargs.mode,
        errors,
        ioargs.compression["method"] not in _compression_to_extension,
    )

    is_path = isinstance(handle, str)
    # split the compression dict into the method and its extra keyword options
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    # Only for write methods
    if "r" not in mode and is_path:
        check_parent_directory(str(handle))

    if compression:
        if compression != "zstd":
            # compression libraries do not like an explicit text-mode
            ioargs.mode = ioargs.mode.replace("t", "")
        elif compression == "zstd" and "b" not in ioargs.mode:
            # python-zstandard defaults to text mode, but we always expect
            # compression libraries to use binary mode.
            ioargs.mode += "b"

        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                # error: Incompatible types in assignment (expression has type
                # "GzipFile", variable has type "Union[str, BaseBuffer]")
                handle = gzip.GzipFile(  # type: ignore[assignment]
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # No overload variant of "GzipFile" matches argument types
                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
                    fileobj=handle,  # type: ignore[call-overload]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            # No overload variant of "BZ2File" matches argument types
            # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
            handle = bz2.BZ2File(  # type: ignore[call-overload]
                handle,
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str,
            # BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
            # ReadBuffer[bytes], WriteBuffer[bytes]]"
            handle = _BytesZipFile(
                handle, ioargs.mode, **compression_args  # type: ignore[arg-type]
            )
            if handle.mode == "r":
                # When reading, the archive itself is tracked for closing and
                # the handle is replaced by its single member.
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError(
                        "Multiple files found in ZIP file. "
                        f"Only one file per ZIP: {zip_names}"
                    )

        # XZ Compression
        elif compression == "xz":
            # note: compression_args are not forwarded for xz
            handle = get_lzma_file()(handle, ioargs.mode)

        # Zstd Compression
        elif compression == "zstd":
            zstd = import_optional_dependency("zstandard")
            # extra options configure the decompressor when reading and the
            # compressor otherwise
            if "r" in ioargs.mode:
                open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
            else:
                open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
            handle = zstd.open(
                handle,
                mode=ioargs.mode,
                **open_args,
            )

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
        # not added to handles as it does not open/buffer resources
        handle = _BytesIOWrapper(
            handle,
            encoding=ioargs.encoding,
        )
    elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            _IOWrapper(handle),  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (
            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
        )

    if "r" in ioargs.mode and not hasattr(handle, "read"):
        raise TypeError(
            "Expected file path name or file-like object, "
            f"got {type(ioargs.filepath_or_buffer)} type"
        )

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        # a buffer opened by _get_filepath_or_buffer is closed last
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    return IOHandles(
        # error: Argument "handle" to "IOHandles" has incompatible type
        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
        handle=handle,  # type: ignore[arg-type]
        # error: Argument "created_handles" to "IOHandles" has incompatible type
        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
        created_handles=handles,  # type: ignore[arg-type]
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )
# error: Definition of "__exit__" in base class "ZipFile" is incompatible with
# definition in base class "BytesIO" [misc]
# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
# definition in base class "BytesIO" [misc]
# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
# definition in base class "BinaryIO" [misc]
# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
# definition in base class "IO" [misc]
# error: Definition of "read" in base class "ZipFile" is incompatible with
# definition in base class "BytesIO" [misc]
# error: Definition of "read" in base class "ZipFile" is incompatible with
# definition in base class "IO" [misc]
class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc]
"""
Wrapper for standard library class ZipFile and allow the returned file-like
handle to accept byte strings via `write` method.
BytesIO provides attributes of file-like object and ZipFile.writestr writes
bytes strings into a member of the archive.
"""
# GH 17778
def __init__(
self,
file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
mode: str,
archive_name: str | None = None,
**kwargs,
):
mode = mode.replace("b", "")
self.archive_name = archive_name
self.multiple_write_buffer: StringIO | BytesIO | None = None
kwargs_zip: dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
kwargs_zip.update(kwargs)
# error: Argument 1 to "__init__" of "ZipFile" has incompatible type
# "Union[_PathLike[str], Union[str, Union[IO[Any], RawIOBase, BufferedIOBase,
# TextIOBase, TextIOWrapper, mmap]]]"; expected "Union[Union[str,
# _PathLike[str]], IO[bytes]]"
super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]
def infer_filename(self):
"""
If an explicit archive_name is not given, we still want the file inside the zip
file not to be named something.zip, because that causes confusion (GH39465).
"""
if isinstance(self.filename, (os.PathLike, str)):
filename = Path(self.filename)
if filename.suffix == ".zip":
return filename.with_suffix("").name
return filename.name
return None
def write(self, data):
# buffer multiple write calls, write on flush
if self.multiple_write_buffer is None:
self.multiple_write_buffer = (
BytesIO() if isinstance(data, bytes) else StringIO()
)
self.multiple_write_buffer.write(data)
def flush(self) -> None:
# write to actual handle and close write buffer
if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
return
# ZipFile needs a non-empty string
archive_name = self.archive_name or self.infer_filename() or "zip"
with self.multiple_write_buffer:
super().writestr(archive_name, self.multiple_write_buffer.getvalue())
def close(self):
self.flush()
super().close()
@property
def closed(self):
return self.fp is None
class _MMapWrapper(abc.Iterator):
"""
Wrapper for the Python's mmap class so that it can be properly read in
by Python's csv.reader class.
Parameters
----------
f : file object
File object to be mapped onto memory. Must support the 'fileno'
method or have an equivalent attribute
"""
def __init__(
self,
f: IO,
encoding: str = "utf-8",
errors: str = "strict",
decode: bool = True,
):
self.encoding = encoding
self.errors = errors
self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors)
self.decode = decode
self.attributes = {}
for attribute in ("seekable", "readable"):
if not hasattr(f, attribute):
continue
self.attributes[attribute] = getattr(f, attribute)()
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
def __getattr__(self, name: str):
if name in self.attributes:
return lambda: self.attributes[name]
return getattr(self.mmap, name)
def __iter__(self) -> _MMapWrapper:
return self
def read(self, size: int = -1) -> str | bytes:
# CSV c-engine uses read instead of iterating
content: bytes = self.mmap.read(size)
if self.decode and self.encoding != "utf-8":
# memory mapping is applied before compression. Encoding should
# be applied to the de-compressed data.
final = size == -1 or len(content) < size
return self.decoder.decode(content, final=final)
return content
def __next__(self) -> str:
newbytes = self.mmap.readline()
# readline returns bytes, not str, but Python's CSV reader
# expects str, so convert the output to str before continuing
newline = self.decoder.decode(newbytes)
# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
if newline == "":
raise StopIteration
# IncrementalDecoder seems to push newline to the next line
return newline.lstrip("\n")
class _IOWrapper:
# TextIOWrapper is overly strict: it request that the buffer has seekable, readable,
# and writable. If we have a read-only buffer, we shouldn't need writable and vice
# versa. Some buffers, are seek/read/writ-able but they do not have the "-able"
# methods, e.g., tempfile.SpooledTemporaryFile.
# If a buffer does not have the above "-able" methods, we simple assume they are
# seek/read/writ-able.
def __init__(self, buffer: BaseBuffer):
self.buffer = buffer
def __getattr__(self, name: str):
return getattr(self.buffer, name)
def readable(self) -> bool:
if hasattr(self.buffer, "readable"):
# error: "BaseBuffer" has no attribute "readable"
return self.buffer.readable() # type: ignore[attr-defined]
return True
def seekable(self) -> bool:
if hasattr(self.buffer, "seekable"):
return self.buffer.seekable()
return True
def writable(self) -> bool:
if hasattr(self.buffer, "writable"):
# error: "BaseBuffer" has no attribute "writable"
return self.buffer.writable() # type: ignore[attr-defined]
return True
class _BytesIOWrapper:
# Wrapper that wraps a StringIO buffer and reads bytes from it
# Created for compat with pyarrow read_csv
def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
self.buffer = buffer
self.encoding = encoding
# Because a character can be represented by more than 1 byte,
# it is possible that reading will produce more bytes than n
# We store the extra bytes in this overflow variable, and append the
# overflow to the front of the bytestring the next time reading is performed
self.overflow = b""
def __getattr__(self, attr: str):
return getattr(self.buffer, attr)
def read(self, n: int | None = -1) -> bytes:
assert self.buffer is not None
bytestring = self.buffer.read(n).encode(self.encoding)
# When n=-1/n greater than remaining bytes: Read entire file/rest of file
combined_bytestring = self.overflow + bytestring
if n is None or n < 0 or n >= len(combined_bytestring):
self.overflow = b""
return combined_bytestring
else:
to_return = combined_bytestring[:n]
self.overflow = combined_bytestring[n:]
return to_return
def _maybe_memory_map(
handle: str | BaseBuffer,
memory_map: bool,
encoding: str,
mode: str,
errors: str | None,
decode: bool,
) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
"""Try to memory map file/buffer."""
handles: list[BaseBuffer] = []
memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
if not memory_map:
return handle, memory_map, handles
# need to open the file first
if isinstance(handle, str):
if encoding and "b" not in mode:
# Encoding
handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
else:
# Binary mode
handle = open(handle, mode)
handles.append(handle)
# error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any],
# RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]"
try:
wrapped = cast(
BaseBuffer,
_MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type]
)
finally:
for handle in reversed(handles):
# error: "BaseBuffer" has no attribute "close"
handle.close() # type: ignore[attr-defined]
handles.append(wrapped)
return wrapped, memory_map, handles
def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool:
    """
    Test whether file exists.

    Anything that does not resolve to a path string (e.g. a buffer) is
    reported as not existing.
    """
    target = stringify_path(filepath_or_buffer)
    if not isinstance(target, str):
        return False
    try:
        return os.path.exists(target)
    # gh-5874: if the filepath is too long will raise here
    except (TypeError, ValueError):
        return False
def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
"""Whether the handle is opened in binary mode"""
# specified by user
if "t" in mode or "b" in mode:
return "b" in mode
# exceptions
text_classes = (
# classes that expect string but have 'b' in mode
codecs.StreamWriter,
codecs.StreamReader,
codecs.StreamReaderWriter,
)
if issubclass(type(handle), text_classes):
return False
return isinstance(handle, _get_binary_io_classes()) or "b" in getattr(
handle, "mode", mode
)
@functools.lru_cache
def _get_binary_io_classes() -> tuple[type, ...]:
"""IO classes that that expect bytes"""
binary_classes: tuple[type, ...] = (BufferedIOBase, RawIOBase)
# python-zstandard doesn't use any of the builtin base classes; instead we
# have to use the `zstd.ZstdDecompressionReader` class for isinstance checks.
# Unfortunately `zstd.ZstdDecompressionReader` isn't exposed by python-zstandard
# so we have to get it from a `zstd.ZstdDecompressor` instance.
# See also https://github.com/indygreg/python-zstandard/pull/165.
zstd = import_optional_dependency("zstandard", errors="ignore")
if zstd is not None:
with zstd.ZstdDecompressor().stream_reader(b"") as reader:
binary_classes += (type(reader),)
return binary_classes