2058 lines
67 KiB
Cython
2058 lines
67 KiB
Cython
# Copyright (c) 2012, Lambda Foundry, Inc.
|
|
# See LICENSE for the license
|
|
from csv import (
|
|
QUOTE_MINIMAL,
|
|
QUOTE_NONE,
|
|
QUOTE_NONNUMERIC,
|
|
)
|
|
from errno import ENOENT
|
|
import sys
|
|
import time
|
|
import warnings
|
|
|
|
from libc.stdlib cimport free
|
|
from libc.string cimport (
|
|
strcasecmp,
|
|
strlen,
|
|
strncpy,
|
|
)
|
|
|
|
import cython
|
|
from cython import Py_ssize_t
|
|
|
|
from cpython.bytes cimport (
|
|
PyBytes_AsString,
|
|
PyBytes_FromString,
|
|
)
|
|
from cpython.exc cimport (
|
|
PyErr_Fetch,
|
|
PyErr_Occurred,
|
|
)
|
|
from cpython.object cimport PyObject
|
|
from cpython.ref cimport (
|
|
Py_INCREF,
|
|
Py_XDECREF,
|
|
)
|
|
from cpython.unicode cimport (
|
|
PyUnicode_AsUTF8String,
|
|
PyUnicode_Decode,
|
|
PyUnicode_DecodeUTF8,
|
|
)
|
|
|
|
|
|
cdef extern from "Python.h":
|
|
object PyUnicode_FromString(char *v)
|
|
|
|
|
|
import numpy as np
|
|
|
|
cimport numpy as cnp
|
|
from numpy cimport (
|
|
float64_t,
|
|
int64_t,
|
|
ndarray,
|
|
uint8_t,
|
|
uint64_t,
|
|
)
|
|
|
|
cnp.import_array()
|
|
|
|
from pandas._libs cimport util
|
|
from pandas._libs.util cimport (
|
|
INT64_MAX,
|
|
INT64_MIN,
|
|
UINT64_MAX,
|
|
)
|
|
|
|
import pandas._libs.lib as lib
|
|
|
|
from pandas._libs.khash cimport (
|
|
kh_destroy_float64,
|
|
kh_destroy_str,
|
|
kh_destroy_str_starts,
|
|
kh_destroy_strbox,
|
|
kh_exist_str,
|
|
kh_float64_t,
|
|
kh_get_float64,
|
|
kh_get_str,
|
|
kh_get_str_starts_item,
|
|
kh_get_strbox,
|
|
kh_init_float64,
|
|
kh_init_str,
|
|
kh_init_str_starts,
|
|
kh_init_strbox,
|
|
kh_put_float64,
|
|
kh_put_str,
|
|
kh_put_str_starts_item,
|
|
kh_put_strbox,
|
|
kh_resize_float64,
|
|
kh_resize_str_starts,
|
|
kh_str_starts_t,
|
|
kh_str_t,
|
|
kh_strbox_t,
|
|
khiter_t,
|
|
)
|
|
|
|
from pandas.errors import (
|
|
EmptyDataError,
|
|
ParserError,
|
|
ParserWarning,
|
|
)
|
|
|
|
from pandas.core.dtypes.common import (
|
|
is_bool_dtype,
|
|
is_datetime64_dtype,
|
|
is_extension_array_dtype,
|
|
is_float_dtype,
|
|
is_integer_dtype,
|
|
is_object_dtype,
|
|
)
|
|
from pandas.core.dtypes.dtypes import CategoricalDtype
|
|
from pandas.core.dtypes.inference import is_dict_like
|
|
|
|
# Module-level C constants.
cdef float64_t INF = <float64_t>np.inf
cdef float64_t NEGINF = -INF
# Default value of TextReader's ``tokenize_chunksize`` argument.
cdef int64_t DEFAULT_CHUNKSIZE = 256 * 1024
|
|
|
|
|
|
cdef extern from "headers/portable.h":
    # No symbols are declared here; the block only pulls the header in.
    # The original author *thinks* it is needed so that strcasecmp is defined
    # on Windows, avoiding
    # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
    # on Appveyor. In a sane world, the `from libc.string cimport` above
    # would fail loudly instead.
    pass
|
|
|
|
|
|
cdef extern from "parser/tokenizer.h":

    # Tokenizer states; the current one is kept in parser_t.state.
    ctypedef enum ParserState:
        START_RECORD
        START_FIELD
        ESCAPED_CHAR
        IN_FIELD
        IN_QUOTED_FIELD
        ESCAPE_IN_QUOTED_FIELD
        QUOTE_IN_QUOTED_FIELD
        EAT_CRNL
        EAT_CRNL_NOP
        EAT_WHITESPACE
        EAT_COMMENT
        EAT_LINE_COMMENT
        WHITESPACE_LINE
        SKIP_LINE
        FINISHED

    enum: ERROR_OVERFLOW

    # Values accepted by parser_t.on_bad_lines.
    ctypedef enum BadLineHandleMethod:
        ERROR,
        WARN,
        SKIP

    # Signature of parser_t.cb_io.
    ctypedef void* (*io_callback)(
        void *src, size_t nbytes, size_t *bytes_read,
        int *status, const char *encoding_errors
    )
    # Signature of parser_t.cb_cleanup.
    ctypedef int (*io_cleanup)(void *src)

    ctypedef struct parser_t:
        void *source
        io_callback cb_io
        io_cleanup cb_cleanup

        int64_t chunksize   # number of bytes to prepare for each chunk
        char *data          # pointer to the data to be processed
        int64_t datalen     # amount of data available
        int64_t datapos

        # Destination for tokenized data
        char *stream
        uint64_t stream_len
        uint64_t stream_cap

        # Words are stored in a (potentially ragged) matrix for now
        char **words
        int64_t *word_starts     # where we are in the stream
        uint64_t words_len
        uint64_t words_cap
        uint64_t max_words_cap   # maximum word cap encountered

        char *pword_start        # pointer to stream start of current field
        int64_t word_start       # position start of current field

        int64_t *line_start      # position in words for start of line
        int64_t *line_fields     # number of fields in each line
        uint64_t lines           # number of lines observed
        uint64_t file_lines      # number of lines observed (with bad/skipped)
        uint64_t lines_cap       # vector capacity

        # Tokenizing options / state
        ParserState state
        int doublequote          # is " represented by ""?
        char delimiter           # field separator
        int delim_whitespace     # consume tabs / spaces instead
        char quotechar           # quote character
        char escapechar          # escape character
        char lineterminator
        int skipinitialspace     # ignore spaces following delimiter?
        int quoting              # style of quoting to write

        char commentchar
        int allow_embedded_newline

        int usecols

        Py_ssize_t expected_fields
        BadLineHandleMethod on_bad_lines

        # Floating point options
        char decimal
        char sci

        # Thousands separator (comma, period)
        char thousands

        int header               # Boolean: 1: has header, 0: no header
        int64_t header_start     # header row start
        uint64_t header_end      # header row end

        void *skipset
        PyObject *skipfunc
        int64_t skip_first_N_rows
        int64_t skipfooter
        # Pick one, depending on whether the converter requires the GIL
        float64_t (*double_converter)(
            const char *, char **, char, char, char, int, int *, int *
        ) nogil

        # Error handling
        char *warn_msg
        char *error_msg

        int64_t skip_empty_lines

    ctypedef struct coliter_t:
        char **words
        int64_t *line_start
        int64_t col

    ctypedef struct uint_state:
        int seen_sint
        int seen_uint
        int seen_null

    void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

    void coliter_setup(
        coliter_t *it, parser_t *parser, int64_t i, int64_t start
    ) nogil
    void COLITER_NEXT(coliter_t, const char *) nogil

    parser_t* parser_new()

    int parser_init(parser_t *self) nogil
    void parser_free(parser_t *self) nogil
    void parser_del(parser_t *self) nogil
    int parser_add_skiprow(parser_t *self, int64_t row)

    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)

    void parser_set_default_options(parser_t *self)

    int parser_consume_rows(parser_t *self, size_t nrows)

    int parser_trim_buffers(parser_t *self)

    int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
    int tokenize_nrows(
        parser_t *self, size_t nrows, const char *encoding_errors
    ) nogil

    int64_t str_to_int64(
        char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep
    ) nogil
    uint64_t str_to_uint64(
        uint_state *state, char *p_item, int64_t int_max,
        uint64_t uint_max, int *error, char tsep
    ) nogil

    # The three candidates for parser_t.double_converter share one signature.
    float64_t xstrtod(
        const char *p, char **q, char decimal, char sci, char tsep,
        int skip_trailing, int *error, int *maybe_int
    ) nogil
    float64_t precise_xstrtod(
        const char *p, char **q, char decimal, char sci, char tsep,
        int skip_trailing, int *error, int *maybe_int
    ) nogil
    float64_t round_trip(
        const char *p, char **q, char decimal, char sci, char tsep,
        int skip_trailing, int *error, int *maybe_int
    ) nogil

    int to_boolean(const char *item, uint8_t *val) nogil
|
|
|
|
|
|
cdef extern from "parser/io.h":
    # Wrap a Python object as a tokenizer source; NULL signals a raised
    # Python exception (``except NULL``).
    void *new_rd_source(object obj) except NULL

    # Matches the io_cleanup callback signature.
    int del_rd_source(void *src)

    # Matches the io_callback signature.
    void* buffer_rd_bytes(
        void *source, size_t nbytes, size_t *bytes_read,
        int *status, const char *encoding_errors
    )
|
|
|
|
|
|
cdef class TextReader:
|
|
"""
|
|
|
|
# source: StringIO or file object
|
|
|
|
.. versionchanged:: 1.2.0
|
|
removed 'compression', 'memory_map', and 'encoding' argument.
|
|
These arguments are outsourced to CParserWrapper.
|
|
'source' has to be a file handle.
|
|
"""
|
|
|
|
cdef:
    # C tokenizer state, allocated with parser_new() in __cinit__
    parser_t *parser
    object na_fvalues
    # user-supplied values (encoded) followed by the module defaults
    object true_values
    object false_values
    object handle
    # the ``header`` argument exactly as passed to __cinit__
    object orig_header
    bint na_filter
    bint keep_default_na
    bint verbose
    bint has_usecols
    bint has_mi_columns
    bint mangle_dupe_cols
    bint allow_leading_cols
    uint64_t parser_start  # this is modified after __init__
    # stack of time.time() stamps used for verbose timing output
    list clocks
    const char *encoding_errors
    # hash sets built from false_values / true_values
    kh_str_starts_t *false_set
    kh_str_starts_t *true_set
    int64_t buffer_lines
    int64_t skipfooter
    list dtype_cast_order  # list[np.dtype]
    list names  # can be None
    set noconvert  # set[int]

cdef public:
    int64_t leading_cols
    int64_t table_width
    object delimiter  # bytes or str
    object converters
    object na_values
    list header  # list[list[non-negative integers]]
    object index_col
    object skiprows
    object dtype
    object usecols
    set unnamed_cols  # set[str]
|
|
|
|
def __cinit__(self, source,
              delimiter=b',',  # bytes | str
              header=0,
              int64_t header_start=0,
              uint64_t header_end=0,
              index_col=None,
              names=None,
              tokenize_chunksize=DEFAULT_CHUNKSIZE,
              bint delim_whitespace=False,
              converters=None,
              bint skipinitialspace=False,
              escapechar=None,      # bytes | str
              bint doublequote=True,
              quotechar=b'"',
              quoting=0,            # int
              lineterminator=None,  # bytes | str
              comment=None,
              decimal=b'.',         # bytes | str
              thousands=None,       # bytes | str
              dtype=None,
              usecols=None,
              on_bad_lines=ERROR,
              bint na_filter=True,
              na_values=None,
              na_fvalues=None,
              bint keep_default_na=True,
              true_values=None,
              false_values=None,
              bint allow_leading_cols=True,
              skiprows=None,
              skipfooter=0,         # int64_t
              bint verbose=False,
              bint mangle_dupe_cols=True,
              float_precision=None,
              bint skip_blank_lines=True,
              encoding_errors=b"strict"):
    # Allocate and configure the C tokenizer, copy the parsing options onto
    # the reader, then read the header to establish the column layout
    # (self.header, self.table_width, self.unnamed_cols, self.buffer_lines).
    #
    # Raises
    # ------
    # ValueError
    #     If a single-character option (delimiter, lineterminator, decimal,
    #     thousands, escapechar, comment) does not have length 1, or if
    #     float_precision is not recognised.
    # TypeError
    #     From _set_quoting for bad quotechar / quoting values.
    # EmptyDataError
    #     If the header step reports a table width of zero.

    # set encoding for native Python and C library
    if isinstance(encoding_errors, str):
        encoding_errors = encoding_errors.encode("utf-8")
    elif encoding_errors is None:
        encoding_errors = b"strict"
    # self.encoding_errors is a raw pointer into this bytes object, so a
    # reference is taken to keep the buffer alive after this call returns.
    Py_INCREF(encoding_errors)
    self.encoding_errors = PyBytes_AsString(encoding_errors)

    self.parser = parser_new()
    self.parser.chunksize = tokenize_chunksize

    self.mangle_dupe_cols = mangle_dupe_cols

    # For timekeeping
    self.clocks = []

    self.parser.usecols = (usecols is not None)

    self._setup_parser_source(source)
    parser_set_default_options(self.parser)

    parser_init(self.parser)

    if delim_whitespace:
        self.parser.delim_whitespace = delim_whitespace
    else:
        # ``!= 1`` (not ``> 1``) so that an empty delimiter is rejected
        # here with a ValueError instead of failing inside ord().
        if len(delimiter) != 1:
            raise ValueError('only length-1 separators excluded right now')
        self.parser.delimiter = <char>ord(delimiter)

    # ----------------------------------------
    # parser options

    self.parser.doublequote = doublequote
    self.parser.skipinitialspace = skipinitialspace
    self.parser.skip_empty_lines = skip_blank_lines

    if lineterminator is not None:
        if len(lineterminator) != 1:
            raise ValueError('Only length-1 line terminators supported')
        self.parser.lineterminator = <char>ord(lineterminator)

    if len(decimal) != 1:
        raise ValueError('Only length-1 decimal markers supported')
    self.parser.decimal = <char>ord(decimal)

    if thousands is not None:
        if len(thousands) != 1:
            raise ValueError('Only length-1 thousands markers supported')
        self.parser.thousands = <char>ord(thousands)

    if escapechar is not None:
        if len(escapechar) != 1:
            raise ValueError('Only length-1 escapes supported')
        self.parser.escapechar = <char>ord(escapechar)

    self._set_quoting(quotechar, quoting)

    dtype_order = ['int64', 'float64', 'bool', 'object']
    if quoting == QUOTE_NONNUMERIC:
        # consistent with csv module semantics, cast all to float
        dtype_order = dtype_order[1:]
    self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

    if comment is not None:
        # Same length-1 rule as the other single-character options.
        if len(comment) != 1:
            raise ValueError('Only length-1 comment characters supported')
        self.parser.commentchar = <char>ord(comment)

    self.parser.on_bad_lines = on_bad_lines

    self.skiprows = skiprows
    if skiprows is not None:
        self._make_skiprow_set()

    self.skipfooter = skipfooter

    # suboptimal
    if usecols is not None:
        self.has_usecols = 1
        # GH-20558, validate usecols at higher level and only pass clean
        # usecols into TextReader.
        self.usecols = usecols

    # TODO: XXX?
    if skipfooter > 0:
        self.parser.on_bad_lines = SKIP

    self.delimiter = delimiter

    self.na_values = na_values
    if na_fvalues is None:
        na_fvalues = set()
    self.na_fvalues = na_fvalues

    self.true_values = _maybe_encode(true_values) + _true_values
    self.false_values = _maybe_encode(false_values) + _false_values

    self.true_set = kset_from_list(self.true_values)
    self.false_set = kset_from_list(self.false_values)

    self.keep_default_na = keep_default_na
    self.converters = converters
    self.na_filter = na_filter

    self.verbose = verbose

    if float_precision == "round_trip":
        # see gh-15140
        self.parser.double_converter = round_trip
    elif float_precision == "legacy":
        self.parser.double_converter = xstrtod
    elif float_precision == "high" or float_precision is None:
        self.parser.double_converter = precise_xstrtod
    else:
        raise ValueError(f'Unrecognized float_precision option: '
                         f'{float_precision}')

    # Caller is responsible for ensuring we have one of
    # - None
    # - DtypeObj
    # - dict[Any, DtypeObj]
    self.dtype = dtype

    # XXX
    self.noconvert = set()

    self.index_col = index_col

    # ----------------------------------------
    # header stuff

    self.allow_leading_cols = allow_leading_cols
    self.leading_cols = 0  # updated in _get_header

    # TODO: no header vs. header is not the first row
    self.has_mi_columns = 0
    self.orig_header = header
    if header is None:
        # sentinel value
        self.parser.header_start = -1
        self.parser.header_end = -1
        self.parser.header = -1
        self.parser_start = 0
        prelim_header = []
    else:
        if isinstance(header, list):
            if len(header) > 1:
                # need to artificially skip the final line
                # which is still a header line
                header = list(header)
                header.append(header[-1] + 1)
                self.parser.header_end = header[-1]
                self.has_mi_columns = 1
            else:
                self.parser.header_end = header[0]

            self.parser_start = header[-1] + 1
            self.parser.header_start = header[0]
            self.parser.header = header[0]
            prelim_header = header
        else:
            self.parser.header_start = header
            self.parser.header_end = header
            self.parser_start = header + 1
            self.parser.header = header
            prelim_header = [header]

    self.names = names
    header, table_width, unnamed_cols = self._get_header(prelim_header)
    # header, table_width, and unnamed_cols are set here, never changed
    self.header = header
    self.table_width = table_width
    self.unnamed_cols = unnamed_cols

    if not self.table_width:
        raise EmptyDataError("No columns to parse from file")

    # Compute buffer_lines as function of table width: the largest power
    # of two strictly below 2**20 // table_width, and at least 1.
    heuristic = 2**20 // self.table_width
    self.buffer_lines = 1
    while self.buffer_lines * 2 < heuristic:
        self.buffer_lines *= 2
|
|
|
|
def __init__(self, *args, **kwargs):
    """Accept and ignore all arguments; setup is done in ``__cinit__``."""
|
|
|
|
def __dealloc__(self):
    # Release what _close() releases first, then delete the parser_t
    # struct itself.
    _close(self)
    parser_del(self.parser)
|
|
|
|
def close(self):
    """Release the reader's resources by delegating to ``_close``."""
    _close(self)
|
|
|
|
def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
    """
    Validate ``quote_char`` / ``quoting`` and store them on the C parser.

    Parameters
    ----------
    quote_char : str, bytes or None
        A single character, or None / empty (``''`` or ``b''``) to disable
        the quote character. Disabling is only allowed together with
        ``quoting == QUOTE_NONE``.
    quoting : int
        Must lie between ``csv.QUOTE_MINIMAL`` and ``csv.QUOTE_NONE``
        inclusive.

    Raises
    ------
    TypeError
        If ``quoting`` is not an int or is out of range, if ``quote_char``
        is neither str, bytes nor None, if it is longer than one character,
        or if it is missing while quoting is enabled.
    """
    if not isinstance(quoting, int):
        raise TypeError('"quoting" must be an integer')

    if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
        raise TypeError('bad "quoting" value')

    if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
        dtype = type(quote_char).__name__
        raise TypeError(f'"quotechar" must be string, not {dtype}')

    # A length check (rather than ``== ''``) so that empty bytes are treated
    # like an empty str instead of falling through to ord(b'').
    if quote_char is None or len(quote_char) == 0:
        if quoting != QUOTE_NONE:
            raise TypeError("quotechar must be set if quoting enabled")
        self.parser.quoting = quoting
        self.parser.quotechar = -1
    elif len(quote_char) > 1:  # 0-len case handled earlier
        raise TypeError('"quotechar" must be a 1-character string')
    else:
        self.parser.quoting = quoting
        self.parser.quotechar = <char>ord(quote_char)
|
|
|
|
cdef _make_skiprow_set(self):
    # Hand self.skiprows to the C parser in whichever form it was given:
    # an integer count, a callable, or an iterable of row numbers.
    skiprows = self.skiprows

    if util.is_integer_object(skiprows):
        parser_set_skipfirstnrows(self.parser, skiprows)
    elif callable(skiprows):
        self.parser.skipfunc = <PyObject *>skiprows
    else:
        for row in skiprows:
            parser_add_skiprow(self.parser, row)
|
|
|
|
cdef _setup_parser_source(self, source):
    # Wrap ``source`` for the tokenizer and register the read / cleanup
    # callbacks. new_rd_source is declared ``except NULL``, so a failure
    # there propagates as a Python exception before anything is stored.
    self.parser.source = new_rd_source(source)
    self.parser.cb_io = &buffer_rd_bytes
    self.parser.cb_cleanup = &del_rd_source
|
|
|
|
cdef _get_header(self, list prelim_header):
    # Build the column header from the file and/or self.names.
    #
    # header is now a list of lists, so field_count should use header[0]
    #
    # Parameters
    #   prelim_header : list of header row numbers (empty when header=None)
    #
    # Returns
    #   (header, field_count, unnamed_cols); header is None when there is
    #   neither a header in the file nor passed names.
    #
    # Raises
    #   ParserError : a requested header row is beyond the end of the file
    #   ValueError  : passed names do not match the number of header fields
    #
    # modifies:
    #   self.parser attributes
    #   self.parser_start
    #   self.leading_cols
    #   self.dtype (a dict dtype gains entries for mangled duplicate names)

    cdef:
        Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
        char *word
        str name
        uint64_t hr, data_line = 0
        list header = []
        set unnamed_cols = set()

    if self.parser.header_start >= 0:

        # Header is in the file
        for level, hr in enumerate(prelim_header):

            this_header = []

            if self.parser.lines < hr + 1:
                self._tokenize_rows(hr + 2)

            if self.parser.lines == 0:
                field_count = 0
                start = self.parser.line_start[0]

            # e.g., if header=3 and file only has 2 lines
            elif (self.parser.lines < hr + 1
                  and not isinstance(self.orig_header, list)) or (
                      self.parser.lines < hr):
                msg = self.orig_header
                if isinstance(msg, list):
                    joined = ','.join(str(m) for m in msg)
                    msg = f"[{joined}], len of {len(msg)},"
                raise ParserError(
                    f'Passed header={msg} but only '
                    f'{self.parser.lines} lines in file')

            else:
                field_count = self.parser.line_fields[hr]
                start = self.parser.line_start[hr]

            unnamed_count = 0
            unnamed_col_indices = []

            for i in range(field_count):
                word = self.parser.words[start + i]

                name = PyUnicode_DecodeUTF8(word, strlen(word),
                                            self.encoding_errors)

                if name == '':
                    if self.has_mi_columns:
                        name = f'Unnamed: {i}_level_{level}'
                    else:
                        name = f'Unnamed: {i}'

                    unnamed_count += 1
                    unnamed_col_indices.append(i)

                this_header.append(name)

            if not self.has_mi_columns and self.mangle_dupe_cols:
                # Ensure that regular columns are used before unnamed ones
                # to keep given names and mangle unnamed columns.
                # Set membership keeps this linear in the number of columns.
                unnamed_index_set = set(unnamed_col_indices)
                col_loop_order = [i for i in range(len(this_header))
                                  if i not in unnamed_index_set
                                  ] + unnamed_col_indices
                counts = {}

                for i in col_loop_order:
                    col = this_header[i]
                    old_col = col
                    cur_count = counts.get(col, 0)

                    if cur_count > 0:
                        # Append ".N" until the name is unused.
                        while cur_count > 0:
                            counts[old_col] = cur_count + 1
                            col = f'{old_col}.{cur_count}'
                            if col in this_header:
                                cur_count += 1
                            else:
                                cur_count = counts.get(col, 0)

                        # Carry a per-column dtype over to the mangled name
                        # unless one was given for it explicitly.
                        if (
                            self.dtype is not None
                            and is_dict_like(self.dtype)
                            and self.dtype.get(old_col) is not None
                            and self.dtype.get(col) is None
                        ):
                            self.dtype.update({col: self.dtype.get(old_col)})

                    this_header[i] = col
                    counts[col] = cur_count + 1

            if self.has_mi_columns:

                # If we have grabbed an extra line, but it's not in our
                # format, save in the buffer, and create a blank extra
                # line for the rest of the parsing code.
                if hr == prelim_header[-1]:
                    lc = len(this_header)
                    ic = (len(self.index_col) if self.index_col
                          is not None else 0)

                    # if wrong number of blanks or no index, not our format
                    if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
                        hr -= 1
                        self.parser_start -= 1
                        this_header = [None] * lc

            data_line = hr + 1
            header.append(this_header)
            unnamed_cols.update({this_header[i] for i in unnamed_col_indices})

        if self.names is not None:
            header = [self.names]

    elif self.names is not None:
        # Names passed
        if self.parser.lines < 1:
            self._tokenize_rows(1)

        header = [self.names]

        if self.parser.lines < 1:
            field_count = len(header[0])
        else:
            field_count = self.parser.line_fields[data_line]

        # Enforce this unless usecols
        if not self.has_usecols:
            self.parser.expected_fields = max(field_count, len(self.names))
    else:
        # No header passed nor to be found in the file
        if self.parser.lines < 1:
            self._tokenize_rows(1)

        return None, self.parser.line_fields[0], unnamed_cols

    # Corner case, not enough lines in the file
    if self.parser.lines < data_line + 1:
        field_count = len(header[0])
    else:  # not self.has_usecols:

        field_count = self.parser.line_fields[data_line]

        # #2981
        if self.names is not None:
            field_count = max(field_count, len(self.names))

    passed_count = len(header[0])

    if (self.has_usecols and self.allow_leading_cols and
            not callable(self.usecols)):
        nuse = len(self.usecols)
        if nuse == passed_count:
            self.leading_cols = 0
        elif self.names is None and nuse < passed_count:
            self.leading_cols = field_count - passed_count
        elif passed_count != field_count:
            raise ValueError('Number of passed names did not match number of '
                             'header fields in the file')
    # oh boy, #2442, #2981
    elif self.allow_leading_cols and passed_count < field_count:
        self.leading_cols = field_count - passed_count

    return header, field_count, unnamed_cols
|
|
|
|
def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
    """
    Read ``rows`` rows and return the converted columns.

    rows=None --> read all rows
    """
    # Memory usage is not a concern on this path, so ask _read_rows to
    # trim the parser buffers (trim=1).
    return self._read_rows(rows, 1)
|
|
|
|
def read_low_memory(self, rows: int | None) -> list[dict[int, "ArrayLike"]]:
    """
    Read in chunks of at most ``self.buffer_lines`` rows.

    rows=None --> read all rows

    The caller is responsible for concatenating the returned chunks,
    see c_parser_wrapper._concatenate_chunks.

    Raises
    ------
    StopIteration
        If no chunk could be read.
    """
    # Conserve intermediate space
    cdef:
        size_t rows_read = 0
        list chunks = []

    if rows is None:
        while True:
            try:
                chunk = self._read_rows(self.buffer_lines, 0)
            except StopIteration:
                break
            if len(chunk) == 0:
                break
            chunks.append(chunk)
    else:
        while rows_read < rows:
            try:
                crows = min(self.buffer_lines, rows - rows_read)
                chunk = self._read_rows(crows, 0)
            except StopIteration:
                break
            if len(chunk) == 0:
                break
            # every column of a chunk has the same length; use the first
            rows_read += len(next(iter(chunk.values())))
            chunks.append(chunk)

    parser_trim_buffers(self.parser)

    if len(chunks) == 0:
        raise StopIteration

    return chunks
|
|
|
|
cdef _tokenize_rows(self, size_t nrows):
    # Tokenize ``nrows`` more rows with the GIL released, then surface any
    # pending warning on stderr and turn a negative status into an error.
    cdef int status

    with nogil:
        status = tokenize_nrows(self.parser, nrows, self.encoding_errors)

    if self.parser.warn_msg != NULL:
        print(self.parser.warn_msg, file=sys.stderr)
        # the message buffer is released here and the slot cleared
        free(self.parser.warn_msg)
        self.parser.warn_msg = NULL

    if status < 0:
        raise_parser_error('Error tokenizing data', self.parser)
|
|
|
|
# -> dict[int, "ArrayLike"]
cdef _read_rows(self, rows, bint trim):
    # Tokenize (rows=None --> the whole input), convert the tokens to
    # columns, and drop the consumed rows from the parser buffers.
    # Raises StopIteration when there are no rows left to convert, and
    # ValueError if a row count is combined with skipfooter.
    cdef:
        int64_t n_buffered
        int64_t n_requested
        int status

    self._start_clock()

    if rows is None:
        with nogil:
            status = tokenize_all_rows(self.parser, self.encoding_errors)

        if self.parser.warn_msg != NULL:
            print(self.parser.warn_msg, file=sys.stderr)
            free(self.parser.warn_msg)
            self.parser.warn_msg = NULL

        if status < 0:
            raise_parser_error('Error tokenizing data', self.parser)
    else:
        n_requested = rows
        n_buffered = self.parser.lines - self.parser_start
        # only tokenize what is not already buffered
        if n_buffered < n_requested:
            self._tokenize_rows(n_requested - n_buffered)

        if self.skipfooter > 0:
            raise ValueError('skipfooter can only be used to read '
                             'the whole file')

    if self.parser_start >= self.parser.lines:
        raise StopIteration
    self._end_clock('Tokenization')

    self._start_clock()
    columns = self._convert_column_data(rows)
    self._end_clock('Type conversion')

    self._start_clock()
    if len(columns) > 0:
        n_read = len(next(iter(columns.values())))
        # trim
        parser_consume_rows(self.parser, n_read)
        if trim:
            parser_trim_buffers(self.parser)
        self.parser_start -= n_read
    self._end_clock('Parser memory cleanup')

    return columns
|
|
|
|
cdef _start_clock(self):
    # Push a start timestamp for the verbose timing report.
    #
    # Timestamps are only ever consumed for verbose output, so nothing is
    # recorded otherwise; appending unconditionally would let self.clocks
    # grow by one entry per call for the lifetime of the reader.
    if self.verbose:
        self.clocks.append(time.time())
|
|
|
|
cdef _end_clock(self, str what):
    # Pop the most recent start timestamp and, in verbose mode, print how
    # long the phase named ``what`` took.
    #
    # The pop happens whether or not verbose is set, so self.clocks cannot
    # accumulate entries; an empty stack is tolerated and simply ignored.
    if not self.clocks:
        return

    started = self.clocks.pop(-1)
    if self.verbose:
        elapsed = time.time() - started
        print(f'{what} took: {elapsed * 1000:.2f} ms')
|
|
|
|
def set_noconvert(self, i: int) -> None:
    """Add column index ``i`` to the ``noconvert`` set."""
    excluded = self.noconvert
    excluded.add(i)
|
|
|
|
def remove_noconvert(self, i: int) -> None:
    """
    Remove column index ``i`` from the ``noconvert`` set.

    Raises
    ------
    KeyError
        If ``i`` is not in the set.
    """
    excluded = self.noconvert
    excluded.remove(i)
|
|
|
|
def _convert_column_data(self, rows: int | None) -> dict[int, "ArrayLike"]:
    """
    Convert the buffered tokens of up to ``rows`` rows into column arrays.

    Parameters
    ----------
    rows : int or None
        Maximum number of rows to convert, starting at ``self.parser_start``;
        None converts every buffered line.

    Returns
    -------
    dict
        Maps column position to the converted array. Columns excluded by
        ``usecols`` are absent.

    Raises
    ------
    ParserError
        If more columns were specified than the data contains, or a column
        could not be parsed.

    Notes
    -----
    Advances ``self.parser_start`` by the number of rows converted.
    """
    cdef:
        int64_t i
        int nused
        kh_str_starts_t *na_hashset = NULL
        int64_t start, end
        object name, na_flist, col_dtype = None
        bint na_filter = 0
        int64_t num_cols
        dict results

    start = self.parser_start

    if rows is None:
        end = self.parser.lines
    else:
        end = min(start + rows, self.parser.lines)

    # num_cols = widest line seen among the buffered lines (-1 if none)
    num_cols = -1
    # Py_ssize_t cast prevents build warning
    for i in range(<Py_ssize_t>self.parser.lines):
        if self.parser.line_fields[i] > num_cols:
            num_cols = self.parser.line_fields[i]

    usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
    names_larger_num_cols = (self.names and
                             len(self.names) - self.leading_cols > num_cols)

    if self.table_width - self.leading_cols > num_cols:
        if (usecols_not_callable_and_exists
                and self.table_width - self.leading_cols < len(self.usecols)
                or names_larger_num_cols):
            raise ParserError(f"Too many columns specified: expected "
                              f"{self.table_width - self.leading_cols} "
                              f"and found {num_cols}")

    if (usecols_not_callable_and_exists and
            all(isinstance(u, int) for u in self.usecols)):
        missing_usecols = [col for col in self.usecols if col >= num_cols]
        if missing_usecols:
            warnings.warn(
                "Defining usecols with out of bounds indices is deprecated "
                "and will raise a ParserError in a future version.",
                FutureWarning,
                stacklevel=6,
            )

    results = {}
    nused = 0
    for i in range(self.table_width):
        if i < self.leading_cols:
            # Pass through leading columns always
            name = i
        elif (self.usecols and not callable(self.usecols) and
                nused == len(self.usecols)):
            # Once we've gathered all requested columns, stop. GH5766
            break
        else:
            name = self._get_column_name(i, nused)
            usecols = set()
            if callable(self.usecols):
                if self.usecols(name):
                    usecols = {i}
            else:
                usecols = self.usecols
            if self.has_usecols and not (i in usecols or
                                         name in usecols):
                continue
            nused += 1

        conv = self._get_converter(i, name)

        # Resolve the dtype requested for this column, if any: by name
        # first, then by position, for a dict; otherwise the single dtype
        # (or the matching field of a structured dtype).
        col_dtype = None
        if self.dtype is not None:
            if isinstance(self.dtype, dict):
                if name in self.dtype:
                    col_dtype = self.dtype[name]
                elif i in self.dtype:
                    col_dtype = self.dtype[i]
            else:
                if self.dtype.names:
                    # structured array
                    col_dtype = np.dtype(self.dtype.descr[i][1])
                else:
                    col_dtype = self.dtype

        if conv:
            if col_dtype is not None:
                warnings.warn((f"Both a converter and dtype were specified "
                               f"for column {name} - only the converter will "
                               f"be used."), ParserWarning,
                              stacklevel=5)
            results[i] = _apply_converter(conv, self.parser, i, start, end)
            continue

        # Collect the list of NaN values associated with the column.
        # If we aren't supposed to do that, or none are collected,
        # we set `na_filter` to `0` (`1` otherwise).
        na_flist = set()

        if self.na_filter:
            na_list, na_flist = self._get_na_list(i, name)
            if na_list is None:
                na_filter = 0
            else:
                na_filter = 1
                na_hashset = kset_from_list(na_list)
        else:
            na_filter = 0

        # Attempt to parse tokens and infer dtype of the column.
        # Should return as the desired dtype (inferred or specified).
        try:
            col_res, na_count = self._convert_tokens(
                i, start, end, name, na_filter, na_hashset,
                na_flist, col_dtype)
        finally:
            # gh-21353
            #
            # Cleanup the NaN hash that we generated
            # to avoid memory leaks.
            if na_filter:
                self._free_na_set(na_hashset)
                # do not carry a dangling pointer into the next column
                na_hashset = NULL

        # don't try to upcast EAs
        if na_count > 0 and not is_extension_array_dtype(col_dtype):
            col_res = _maybe_upcast(col_res)

        if col_res is None:
            raise ParserError(f'Unable to parse column {i}')

        results[i] = col_res

    self.parser_start += end - start

    return results
|
|
|
|
# -> tuple["ArrayLike", int]:
cdef inline _convert_tokens(self, Py_ssize_t i, int64_t start,
                            int64_t end, object name, bint na_filter,
                            kh_str_starts_t *na_hashset,
                            object na_flist, object col_dtype):
    # Convert column ``i`` over rows [start, end). A requested col_dtype is
    # tried first; if that yields nothing, the dtypes in
    # self.dtype_cast_order are tried in turn and the result is then cast
    # to col_dtype, allowing only value-preserving casts.

    if col_dtype is not None:
        col_res, na_count = self._convert_with_dtype(
            col_dtype, i, start, end, na_filter,
            1, na_hashset, na_flist)

        # A None result means we fall back on the generic parse below
        # (e.g. we requested int dtype, but it's actually a float).
        if col_res is not None:
            return col_res, na_count

    if i in self.noconvert:
        return self._string_convert(i, start, end, na_filter, na_hashset)

    col_res = None
    for candidate in self.dtype_cast_order:
        try:
            col_res, na_count = self._convert_with_dtype(
                candidate, i, start, end, na_filter, 0, na_hashset, na_flist)
        except ValueError:
            # This error is raised from trying to convert to uint64,
            # and we discover that we cannot convert to any numerical
            # dtype successfully. As a result, we leave the data
            # column AS IS with object dtype.
            col_res, na_count = self._convert_with_dtype(
                np.dtype('object'), i, start, end, 0,
                0, na_hashset, na_flist)
        except OverflowError:
            col_res, na_count = self._convert_with_dtype(
                np.dtype('object'), i, start, end, na_filter,
                0, na_hashset, na_flist)

        if col_res is not None:
            break

    # Nothing more to do unless a fallback parse has to be cast to the
    # dtype the user asked for.
    if col_res is None or col_dtype is None:
        return col_res, na_count

    if col_res.dtype == np.bool_:
        # If col_res is bool, it might actually be a bool array mixed with
        # NaNs (see _try_bool_flex()). Usually this would be taken care of
        # using _maybe_upcast(), but if col_dtype is a floating type we
        # should just take care of that cast here.
        if is_float_dtype(col_dtype):
            mask = col_res.view(np.uint8) == na_values[np.uint8]
            col_res = col_res.astype(col_dtype)
            np.putmask(col_res, mask, np.nan)
            return col_res, na_count

        # NaNs are already cast to True here, so can not use astype
        if is_integer_dtype(col_dtype) and na_count > 0:
            raise ValueError(
                f"cannot safely convert passed user dtype of "
                f"{col_dtype} for {np.bool_} dtyped data in "
                f"column {i} due to NA values"
            )

    # only allow safe casts, eg. with a nan you cannot safely cast to int
    try:
        col_res = col_res.astype(col_dtype, casting='safe')
    except TypeError:
        # float -> int conversions can fail the above
        # even with no nans
        col_res_orig = col_res
        col_res = col_res.astype(col_dtype)
        if (col_res != col_res_orig).any():
            raise ValueError(
                f"cannot safely convert passed user dtype of "
                f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
                f"column {i}")

    return col_res, na_count
|
|
|
|
cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                         int64_t start, int64_t end,
                         bint na_filter,
                         bint user_dtype,
                         kh_str_starts_t *na_hashset,
                         object na_flist):
    # Convert column ``i`` over rows [start, end) to ``dtype`` and return
    # (result, na_count). ``user_dtype`` marks a dtype requested by the
    # user, in which case NA values in an int or bool column are an error.
    # Every branch below returns or raises, so they are written as
    # independent guards; their order is significant.

    if isinstance(dtype, CategoricalDtype):
        # TODO: I suspect that _categorical_convert could be
        # optimized when dtype is an instance of CategoricalDtype
        codes, cats, na_count = _categorical_convert(
            self.parser, i, start, end, na_filter, na_hashset)

        # Method accepts list of strings, not encoded ones.
        decoded_true = [x.decode() for x in self.true_values]
        array_type = dtype.construct_array_type()
        cat = array_type._from_inferred_categories(
            cats, codes, dtype, true_values=decoded_true)
        return cat, na_count

    if is_extension_array_dtype(dtype):
        result, na_count = self._string_convert(i, start, end, na_filter,
                                                na_hashset)

        array_type = dtype.construct_array_type()
        try:
            # use _from_sequence_of_strings if the class defines it
            if is_bool_dtype(dtype):
                decoded_true = [x.decode() for x in self.true_values]
                decoded_false = [x.decode() for x in self.false_values]
                result = array_type._from_sequence_of_strings(
                    result, dtype=dtype, true_values=decoded_true,
                    false_values=decoded_false)
            else:
                result = array_type._from_sequence_of_strings(result, dtype=dtype)
        except NotImplementedError:
            raise NotImplementedError(
                f"Extension Array: {array_type} must implement "
                f"_from_sequence_of_strings in order "
                f"to be used in parser methods")

        return result, na_count

    if is_integer_dtype(dtype):
        try:
            result, na_count = _try_int64(self.parser, i, start,
                                          end, na_filter, na_hashset)
            if user_dtype and na_count is not None and na_count > 0:
                raise ValueError(f"Integer column has NA values in column {i}")
        except OverflowError:
            # does not fit int64; retry as uint64
            result = _try_uint64(self.parser, i, start, end,
                                 na_filter, na_hashset)
            na_count = 0

        if result is not None and dtype != 'int64':
            result = result.astype(dtype)

        return result, na_count

    if is_float_dtype(dtype):
        result, na_count = _try_double(self.parser, i, start, end,
                                       na_filter, na_hashset, na_flist)

        if result is not None and dtype != 'float64':
            result = result.astype(dtype)
        return result, na_count

    if is_bool_dtype(dtype):
        result, na_count = _try_bool_flex(self.parser, i, start, end,
                                          na_filter, na_hashset,
                                          self.true_set, self.false_set)
        if user_dtype and na_count is not None and na_count > 0:
            raise ValueError(f"Bool column has NA values in column {i}")
        return result, na_count

    if dtype.kind == 'S':
        # TODO: na handling
        width = dtype.itemsize
        if width > 0:
            return _to_fw_string(self.parser, i, start, end, width), 0

        # treat as a regular string parsing
        return self._string_convert(i, start, end, na_filter,
                                    na_hashset)

    if dtype.kind == 'U':
        width = dtype.itemsize
        if width > 0:
            raise TypeError(f"the dtype {dtype} is not supported for parsing")

        # unicode variable width
        return self._string_convert(i, start, end, na_filter,
                                    na_hashset)

    if is_object_dtype(dtype):
        return self._string_convert(i, start, end, na_filter,
                                    na_hashset)

    if is_datetime64_dtype(dtype):
        raise TypeError(f"the dtype {dtype} is not supported "
                        f"for parsing, pass this column "
                        f"using parse_dates instead")

    raise TypeError(f"the dtype {dtype} is not supported for parsing")
|
|
|
|
# -> tuple[ndarray[object], int]
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                     bint na_filter, kh_str_starts_t *na_hashset):
    # Thin wrapper: box column ``i`` as strings, decoding with the reader's
    # encoding_errors policy.
    return _string_box_utf8(
        self.parser, i, start, end, na_filter, na_hashset,
        self.encoding_errors,
    )
|
|
|
|
def _get_converter(self, i: int, name):
    """
    Look up the user-supplied converter for a column.

    A converter registered under the column ``name`` wins; otherwise the
    converter registered under the position ``i`` is used.

    Returns
    -------
    The converter, or None when no converters were configured or none
    matches this column.
    """
    registry = self.converters
    if registry is None:
        return None

    if name is None or name not in registry:
        # Fall back to a converter keyed by position, if any
        return registry.get(i)

    return registry[name]
|
|
|
|
cdef _get_na_list(self, Py_ssize_t i, name):
    """
    Return ``(na_values, na_fvalues)`` to use for one column.

    The column is looked up by ``name`` first and by position ``i`` second
    when ``self.na_values`` is a dict. For a non-dict ``self.na_values`` the
    same values apply to every column.

    Note: in the non-dict case this normalizes ``self.na_values`` to a list
    and ``self.na_fvalues`` to a set in place.
    """
    if self.na_values is None:
        return None, set()

    if not isinstance(self.na_values, dict):
        # Global NA values: coerce the stored containers once, then reuse.
        if not isinstance(self.na_values, list):
            self.na_values = list(self.na_values)
        if not isinstance(self.na_fvalues, set):
            self.na_fvalues = set(self.na_fvalues)

        return _ensure_encoded(self.na_values), self.na_fvalues

    # Per-column NA values: prefer the name key, then the positional key.
    if name is not None and name in self.na_values:
        key = name
    elif i in self.na_values:
        key = i
    elif self.keep_default_na:
        # No na_values provided for this column; use the defaults.
        return _NA_VALUES, set()
    else:
        # No na_values provided for this column and defaults are disabled.
        return list(), set()

    values = self.na_values[key]
    if not (values is None or isinstance(values, list)):
        values = list(values)

    fvalues = self.na_fvalues[key]
    if not (fvalues is None or isinstance(fvalues, set)):
        fvalues = set(fvalues)

    return _ensure_encoded(values), fvalues
|
|
|
|
cdef _free_na_set(self, kh_str_starts_t *table):
    # Release an NA hash set by handing it to kh_destroy_str_starts.
    kh_destroy_str_starts(table)
|
|
|
|
cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
    """
    Determine the name of the column at parser position ``i``.

    ``nused`` indexes ``self.names`` when ``usecols`` is a non-callable
    whose length equals ``len(self.names)``. Returns None when neither
    names (with usecols) nor a header is available.
    """
    cdef int64_t j

    if self.has_usecols and self.names is not None:
        if callable(self.usecols) or len(self.names) != len(self.usecols):
            return self.names[i - self.leading_cols]
        return self.names[nused]

    if self.header is None:
        return None

    j = i - self.leading_cols
    if j >= len(self.header[0]):
        # more columns than headers: generate an extra (bogus) header
        return j
    if self.has_mi_columns:
        return tuple(header_row[j] for header_row in self.header)
    return self.header[0][j]
|
|
|
|
|
|
# Factor out code common to TextReader.__dealloc__ and TextReader.close
|
|
# It cannot be a class method, since calling self.close() in __dealloc__
|
|
# causes a class attribute lookup and violates best practices
|
|
# https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc
|
|
cdef _close(TextReader reader):
    # Free the parser and both boolean hash sets owned by ``reader``.
    # Each set pointer is reset to NULL after it is destroyed, so a second
    # call does not destroy the same set again.
    parser_free(reader.parser)

    if reader.true_set != NULL:
        kh_destroy_str_starts(reader.true_set)
        reader.true_set = NULL

    if reader.false_set != NULL:
        kh_destroy_str_starts(reader.false_set)
        reader.false_set = NULL
|
|
|
|
|
|
# Module-level lists of byte-string spellings of boolean True / False.
cdef:
    object _true_values = [b'True', b'TRUE', b'true']
    object _false_values = [b'False', b'FALSE', b'false']
|
|
|
|
|
|
def _ensure_encoded(list lst):
    """
    Return a new list with every element of ``lst`` as ``bytes``.

    ``bytes`` elements are kept as they are, ``str`` elements are encoded
    as UTF-8, and anything else is converted with ``str()`` first and then
    encoded as UTF-8.
    """
    cdef list encoded = []

    for item in lst:
        if isinstance(item, bytes):
            encoded.append(item)
        elif isinstance(item, str):
            encoded.append(PyUnicode_AsUTF8String(item))
        else:
            encoded.append(str(item).encode('utf-8'))

    return encoded
|
|
|
|
|
|
# Strings that are treated as NA by default.
# Representations of infinity ('1.#INF', '-1.#INF', '1.#INF000000', ...)
# are deliberately not part of this set.
STR_NA_VALUES = {
    "-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN",
    "#N/A N/A", "#N/A", "N/A", "n/a",
    "NA", "<NA>", "#NA",
    "NULL", "null",
    "NaN", "-NaN", "nan", "-nan",
    "",
}
|
|
# Default NA strings as a list of bytes (see _ensure_encoded).
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
|
|
|
|
|
|
def _maybe_upcast(arr):
    """
    Replace NA sentinel values in ``arr`` with NaN, upcasting as needed.

    * Integer arrays are cast to float; entries equal to
      ``na_values[arr.dtype]`` become NaN.
    * Boolean arrays are cast to object; entries whose uint8 view equals
      ``na_values[np.uint8]`` become NaN.
    * Any other array is returned unchanged.
    """
    dtype = arr.dtype

    if issubclass(dtype.type, np.integer):
        sentinel = na_values[dtype]
        arr = arr.astype(float)
        np.putmask(arr, arr == sentinel, np.nan)
        return arr

    if dtype == np.bool_:
        # the mask must be computed on the raw bytes before casting
        is_na = arr.view(np.uint8) == na_values[np.uint8]
        arr = arr.astype(object)
        np.putmask(arr, is_na, np.nan)

    return arr
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Type conversions / inference support code
|
|
|
|
|
|
# -> tuple[ndarray[object], int]
|
|
cdef _string_box_utf8(parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      bint na_filter, kh_str_starts_t *na_hashset,
                      const char *encoding_errors):
    """
    Decode one parsed column into an object ndarray of Python strings.

    Each distinct raw word is decoded from UTF-8 only once; repeated words
    reuse the string object created for the first occurrence, tracked in a
    temporary hash table that is always destroyed before returning.

    Parameters
    ----------
    parser : parser_t*
    col : int64_t
        Column to read.
    line_start, line_end : int64_t
        ``line_end - line_start`` rows are read, starting at ``line_start``.
    na_filter : bint
        When true, words found in ``na_hashset`` are replaced by
        ``na_values[np.object_]`` and counted.
    na_hashset : kh_str_starts_t*
    encoding_errors : const char*
        Error handler name passed to ``PyUnicode_Decode``.

    Returns
    -------
    tuple[ndarray[object], int]
        The boxed values and the number of NA entries.

    Raises
    ------
    Whatever ``PyUnicode_Decode`` raises for undecodable input under the
    given error handler.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result

        int ret = 0
        kh_strbox_t *table

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    lines = line_end - line_start
    # Allocate the output before the hash table so that an allocation
    # failure here cannot leak the table.
    result = np.empty(lines, dtype=np.object_)
    coliter_setup(&it, parser, col, line_start)

    table = kh_init_strbox()
    try:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # in the NA hash table
                    na_count += 1
                    result[i] = NA
                    continue

            k = kh_get_strbox(table, word)

            if k != table.n_buckets:
                # already boxed: reuse the existing string object
                pyval = <object>table.vals[k]
            else:
                # first occurrence: decode (may raise) and remember it
                pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)

                k = kh_put_strbox(table, word, &ret)
                # the table stores a raw pointer only; result[i] below holds
                # the reference that keeps the string alive
                table.vals[k] = <PyObject *>pyval

            result[i] = pyval
    finally:
        # runs on the decode-error path too, so the table is never leaked
        kh_destroy_strbox(table)

    return result, na_count
|
|
|
|
|
|
@cython.boundscheck(False)
cdef _categorical_convert(parser_t *parser, int64_t col,
                          int64_t line_start, int64_t line_end,
                          bint na_filter, kh_str_starts_t *na_hashset):
    """
    Convert column data into codes, categories.

    Words are factorized in order of first appearance. NA words (when
    ``na_filter`` is true and the word is in ``na_hashset``) get code -1.

    Returns
    -------
    tuple
        ``(codes, categories, na_count)`` where ``codes`` is an int64
        ndarray, ``categories`` is an object ndarray of the distinct words
        boxed with ``PyUnicode_FromString``, and ``na_count`` is the number
        of NA entries.

    Raises
    ------
    Whatever ``PyUnicode_FromString`` raises for a word it cannot decode.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL

        int64_t NA = -1
        int64_t[:] codes
        int64_t current_category = 0

        int ret = 0
        kh_str_t *table
        khiter_t k

    lines = line_end - line_start
    codes = np.empty(lines, dtype=np.int64)

    # factorize parsed values, creating a hash table
    # bytes -> category code
    table = kh_init_str()
    try:
        with nogil:
            coliter_setup(&it, parser, col, line_start)

            for i in range(lines):
                COLITER_NEXT(it, word)

                if na_filter:
                    if kh_get_str_starts_item(na_hashset, word):
                        # is in NA values
                        na_count += 1
                        codes[i] = NA
                        continue

                k = kh_get_str(table, word)
                # not in the hash table
                if k == table.n_buckets:
                    k = kh_put_str(table, word, &ret)
                    table.vals[k] = current_category
                    current_category += 1

                codes[i] = table.vals[k]

        # parse and box categories to python strings
        result = np.empty(table.n_occupied, dtype=np.object_)
        for k in range(table.n_buckets):
            if kh_exist_str(table, k):
                result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
    finally:
        # also reached when boxing a category raises, so the table is
        # never leaked
        kh_destroy_str(table)

    return np.asarray(codes), result, na_count
|
|
|
|
|
|
# -> ndarray[f'|S{width}']
|
|
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                   int64_t line_end, int64_t width):
    """
    Read a column into a fixed-width bytes array of dtype ``|S{width}``.

    Allocates ``line_end - line_start`` items and lets
    ``_to_fw_string_nogil`` fill the raw buffer with the GIL released.
    """
    cdef:
        char *buf
        ndarray out
        int64_t nrows = line_end - line_start

    out = np.empty(nrows, dtype=f'|S{width}')
    buf = <char*>out.data

    with nogil:
        _to_fw_string_nogil(parser, col, line_start, line_end, width, buf)

    return out
|
|
|
|
|
|
cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start, int64_t line_end,
                                     size_t width, char *data) nogil:
    # Copy each word of the column into consecutive ``width``-byte slots of
    # ``data``. strncpy writes exactly ``width`` bytes per slot: longer
    # words are truncated, shorter ones are NUL-padded.
    cdef:
        int64_t row
        int64_t nrows = line_end - line_start
        coliter_t it
        const char *token = NULL
        char *dest = data

    coliter_setup(&it, parser, col, line_start)

    for row in range(nrows):
        COLITER_NEXT(it, token)
        strncpy(dest, token, width)
        dest += width
|
|
|
|
|
|
# Spellings of infinity that _try_double_nogil compares words against
# with strcasecmp (so case does not matter).
cdef:
    char* cinf = b'inf'
    char* cposinf = b'+inf'
    char* cneginf = b'-inf'

    char* cinfty = b'Infinity'
    char* cposinfty = b'+Infinity'
    char* cneginfty = b'-Infinity'
|
|
|
|
|
|
# -> tuple[ndarray[float64_t], int] | tuple[None, None]
|
|
cdef _try_double(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
    """
    Try to parse a column as float64.

    ``na_flist`` is turned into a temporary float hash set which is
    destroyed after the conversion. The conversion itself runs with the
    GIL released.

    Returns
    -------
    tuple[ndarray[float64_t], int] or tuple[None, None]
        ``(values, na_count)`` on success, ``(None, None)`` when the nogil
        helper reports an error.
    """
    cdef:
        int status, na_count = 0
        Py_ssize_t nrows
        float64_t *buf
        float64_t NA = na_values[np.float64]
        kh_float64_t *float_na_set
        ndarray[float64_t] out
        bint use_na_flist = len(na_flist) > 0

    nrows = line_end - line_start
    out = np.empty(nrows, dtype=np.float64)
    buf = <float64_t *>out.data

    float_na_set = kset_float64_from_list(na_flist)
    with nogil:
        status = _try_double_nogil(parser, parser.double_converter,
                                   col, line_start, line_end,
                                   na_filter, na_hashset, use_na_flist,
                                   float_na_set, NA, buf, &na_count)
    kh_destroy_float64(float_na_set)

    if status == 0:
        return out, na_count
    return None, None
|
|
|
|
|
|
cdef inline int _try_double_nogil(parser_t *parser,
                                  float64_t (*double_converter)(
                                      const char *, char **, char,
                                      char, char, int, int *, int *) nogil,
                                  int64_t col, int64_t line_start, int64_t line_end,
                                  bint na_filter, kh_str_starts_t *na_hashset,
                                  bint use_na_flist,
                                  const kh_float64_t *na_flist,
                                  float64_t NA, float64_t *data,
                                  int *na_count) nogil:
    # Parse every word of the column with ``double_converter`` and write the
    # results to ``data``. Returns 0 on success and 1 as soon as a word is
    # neither a number nor one of the recognised infinity spellings.
    # ``na_count[0]`` is reset and then counts the NA entries written.
    cdef:
        int status = 0
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *token = NULL
        char *stop
        khiter_t slot

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, token)

            if kh_get_str_starts_item(na_hashset, token):
                # the word itself is an NA marker
                na_count[0] += 1
                data[row] = NA
                continue

            data[row] = double_converter(token, &stop, parser.decimal,
                                         parser.sci, parser.thousands,
                                         1, &status, NULL)
            if status != 0 or stop == token or stop[0]:
                # conversion failed or did not consume the whole word;
                # the only remaining accepted inputs are infinities
                status = 0
                if (strcasecmp(token, cinf) == 0 or
                        strcasecmp(token, cposinf) == 0 or
                        strcasecmp(token, cinfty) == 0 or
                        strcasecmp(token, cposinfty) == 0):
                    data[row] = INF
                elif (strcasecmp(token, cneginf) == 0 or
                        strcasecmp(token, cneginfty) == 0):
                    data[row] = NEGINF
                else:
                    return 1

            if use_na_flist:
                # the parsed number may itself be listed as an NA value
                slot = kh_get_float64(na_flist, data[row])
                if slot != na_flist.n_buckets:
                    na_count[0] += 1
                    data[row] = NA
    else:
        for row in range(nrows):
            COLITER_NEXT(it, token)

            data[row] = double_converter(token, &stop, parser.decimal,
                                         parser.sci, parser.thousands,
                                         1, &status, NULL)
            if status != 0 or stop == token or stop[0]:
                status = 0
                if (strcasecmp(token, cinf) == 0 or
                        strcasecmp(token, cposinf) == 0 or
                        strcasecmp(token, cinfty) == 0 or
                        strcasecmp(token, cposinfty) == 0):
                    data[row] = INF
                elif (strcasecmp(token, cneginf) == 0 or
                        strcasecmp(token, cneginfty) == 0):
                    data[row] = NEGINF
                else:
                    return 1

    return 0
|
|
|
|
|
|
cdef _try_uint64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    """
    Try to parse a column as uint64.

    Returns
    -------
    ndarray[uint64] or None
        None when the nogil helper reports a non-overflow error.

    Raises
    ------
    OverflowError
        When the helper reports ``ERROR_OVERFLOW`` or the parse state has
        ``seen_sint`` set.
    ValueError
        When ``uint64_conflict`` reports a conflict in the parse state.
    """
    cdef:
        int status
        Py_ssize_t nrows
        coliter_t it
        uint64_t *buf
        ndarray out
        uint_state state

    nrows = line_end - line_start
    out = np.empty(nrows, dtype=np.uint64)
    buf = <uint64_t *>out.data

    uint_state_init(&state)
    # NOTE(review): ``it`` is not read again in this function; the nogil
    # helper sets up its own iterator.
    coliter_setup(&it, parser, col, line_start)

    with nogil:
        status = _try_uint64_nogil(parser, col, line_start, line_end,
                                   na_filter, na_hashset, buf, &state)

    if status != 0:
        if status == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None

    if uint64_conflict(&state):
        raise ValueError('Cannot convert to numerical dtype')

    if state.seen_sint:
        raise OverflowError('Overflow')

    return out
|
|
|
|
|
|
cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
                                  int64_t line_start,
                                  int64_t line_end, bint na_filter,
                                  const kh_str_starts_t *na_hashset,
                                  uint64_t *data, uint_state *state) nogil:
    # Convert every word of the column with str_to_uint64, writing into
    # ``data``. Returns 0 on success or the first non-zero error code.
    # With ``na_filter`` set, NA words are stored as 0 and flagged through
    # ``state.seen_null``.
    cdef:
        int status
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *token = NULL

    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, token)
            data[row] = str_to_uint64(state, token, INT64_MAX, UINT64_MAX,
                                      &status, parser.thousands)
            if status != 0:
                return status
        return 0

    for row in range(nrows):
        COLITER_NEXT(it, token)
        if kh_get_str_starts_item(na_hashset, token):
            # NA marker: remember that a null was seen
            state.seen_null = 1
            data[row] = 0
            continue

        data[row] = str_to_uint64(state, token, INT64_MAX, UINT64_MAX,
                                  &status, parser.thousands)
        if status != 0:
            return status

    return 0
|
|
|
|
|
|
cdef _try_int64(parser_t *parser, int64_t col,
                int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_starts_t *na_hashset):
    """
    Try to parse a column as int64.

    Returns
    -------
    tuple[ndarray[int64], int] or tuple[None, None]
        ``(values, na_count)`` on success; ``(None, None)`` when the nogil
        helper reports a non-overflow error.

    Raises
    ------
    OverflowError
        When the helper reports ``ERROR_OVERFLOW``.
    """
    cdef:
        int status, na_count = 0
        Py_ssize_t nrows
        coliter_t it
        int64_t *buf
        ndarray out
        int64_t NA = na_values[np.int64]

    nrows = line_end - line_start
    out = np.empty(nrows, dtype=np.int64)
    buf = <int64_t *>out.data

    # NOTE(review): ``it`` is not read again in this function; the nogil
    # helper sets up its own iterator.
    coliter_setup(&it, parser, col, line_start)

    with nogil:
        status = _try_int64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, NA, buf, &na_count)

    if status != 0:
        if status == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None, None

    return out, na_count
|
|
|
|
|
|
cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
                                 int64_t line_start,
                                 int64_t line_end, bint na_filter,
                                 const kh_str_starts_t *na_hashset, int64_t NA,
                                 int64_t *data, int *na_count) nogil:
    # Convert every word of the column with str_to_int64, writing into
    # ``data``. Returns 0 on success or the first non-zero error code.
    # ``na_count[0]`` is reset and then counts the NA words, which are
    # stored as ``NA``.
    cdef:
        int status
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *token = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, token)
            data[row] = str_to_int64(token, INT64_MIN, INT64_MAX,
                                     &status, parser.thousands)
            if status != 0:
                return status
        return 0

    for row in range(nrows):
        COLITER_NEXT(it, token)
        if kh_get_str_starts_item(na_hashset, token):
            # NA marker
            na_count[0] += 1
            data[row] = NA
            continue

        data[row] = str_to_int64(token, INT64_MIN, INT64_MAX,
                                 &status, parser.thousands)
        if status != 0:
            return status

    return 0
|
|
|
|
|
|
# -> tuple[ndarray[bool], int]
|
|
cdef _try_bool_flex(parser_t *parser, int64_t col,
                    int64_t line_start, int64_t line_end,
                    bint na_filter, const kh_str_starts_t *na_hashset,
                    const kh_str_starts_t *true_hashset,
                    const kh_str_starts_t *false_hashset):
    """
    Try to parse a column as booleans.

    The values are parsed into a uint8 buffer with the GIL released and the
    buffer is returned viewed as ``np.bool_``.

    Returns
    -------
    tuple[ndarray[bool], int] or tuple[None, None]
        ``(values, na_count)`` on success, ``(None, None)`` when the nogil
        helper reports an error.
    """
    cdef:
        int status, na_count = 0
        Py_ssize_t nrows
        uint8_t *buf
        ndarray out
        uint8_t NA = na_values[np.bool_]

    nrows = line_end - line_start
    out = np.empty(nrows, dtype=np.uint8)
    buf = <uint8_t *>out.data

    with nogil:
        status = _try_bool_flex_nogil(parser, col, line_start, line_end,
                                      na_filter, na_hashset, true_hashset,
                                      false_hashset, NA, buf, &na_count)

    if status == 0:
        return out.view(np.bool_), na_count
    return None, None
|
|
|
|
|
|
cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
                                     int64_t line_start,
                                     int64_t line_end, bint na_filter,
                                     const kh_str_starts_t *na_hashset,
                                     const kh_str_starts_t *true_hashset,
                                     const kh_str_starts_t *false_hashset,
                                     uint8_t NA, uint8_t *data,
                                     int *na_count) nogil:
    # Classify every word of the column, in this order: NA set (only when
    # ``na_filter``), true set, false set, then ``to_boolean``. Returns 0 on
    # success or the first non-zero code from ``to_boolean``.
    # ``na_count[0]`` is reset and then counts the NA words.
    cdef:
        int status = 0
        Py_ssize_t row, nrows = line_end - line_start
        coliter_t it
        const char *token = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for row in range(nrows):
            COLITER_NEXT(it, token)

            if kh_get_str_starts_item(na_hashset, token):
                # NA marker
                na_count[0] += 1
                data[0] = NA
            elif kh_get_str_starts_item(true_hashset, token):
                data[0] = 1
            elif kh_get_str_starts_item(false_hashset, token):
                data[0] = 0
            else:
                status = to_boolean(token, data)
                if status != 0:
                    return status
            data += 1
    else:
        for row in range(nrows):
            COLITER_NEXT(it, token)

            if kh_get_str_starts_item(true_hashset, token):
                data[0] = 1
            elif kh_get_str_starts_item(false_hashset, token):
                data[0] = 0
            else:
                status = to_boolean(token, data)
                if status != 0:
                    return status
            data += 1

    return 0
|
|
|
|
|
|
cdef kh_str_starts_t* kset_from_list(list values) except NULL:
    # Build a string hash set from a list of ``bytes``.
    # The caller owns the returned table and must free it.
    # Raises ValueError (after destroying the table) if any element is not
    # ``bytes``.
    cdef:
        Py_ssize_t idx, count
        kh_str_starts_t *table
        int ret = 0
        object item

    table = kh_init_str_starts()

    count = len(values)
    for idx in range(count):
        item = values[idx]

        if isinstance(item, bytes):
            kh_put_str_starts_item(table, PyBytes_AsString(item), &ret)
        else:
            # None creeps in sometimes, which isn't possible here
            kh_destroy_str_starts(table)
            raise ValueError('Must be all encoded bytes')

    if table.table.n_buckets <= 128:
        # Grow small tables so they are almost empty: fewer collisions make
        # the "key not in table" lookup faster, at the cost of memory.
        kh_resize_str_starts(table, table.table.n_buckets * 8)

    return table
|
|
|
|
|
|
cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    # Build a float64 hash set from an iterable of values convertible with
    # ``float()``.
    # The caller owns the returned table and must free it.
    # If iterating ``values`` or converting an element raises, the partially
    # built table is destroyed before the exception propagates (mirrors the
    # error path of kset_from_list).
    cdef:
        kh_float64_t *table
        int ret = 0
        float64_t val
        object value

    table = kh_init_float64()

    try:
        for value in values:
            val = float(value)
            kh_put_float64(table, val, &ret)
    except BaseException:
        # do not leak the table on a failed conversion
        kh_destroy_float64(table)
        raise

    if table.n_buckets <= 128:
        # See reasoning in kset_from_list
        kh_resize_float64(table, table.n_buckets * 8)
    return table
|
|
|
|
|
|
cdef raise_parser_error(object base, parser_t *parser):
    """
    Raise the most specific error available for a failed parser call.

    If a Python exception is pending and carries a value, that exception is
    re-raised (or, when the fetched value is a plain message string, an
    exception of the fetched type, defaulting to ParserError, is raised
    with that message). Otherwise a ParserError is raised whose message is
    ``base`` followed by the parser's C error message.
    """
    cdef:
        object old_exc
        object exc_type
        PyObject *type
        PyObject *value
        PyObject *traceback

    if PyErr_Occurred():
        PyErr_Fetch(&type, &value, &traceback)
        Py_XDECREF(traceback)

        if value != NULL:
            # take our own reference before dropping the fetched one
            old_exc = <object>value
            Py_XDECREF(value)

            if not isinstance(old_exc, str):
                Py_XDECREF(type)
                raise old_exc

            # PyErr_Fetch only returned the error message in *value,
            # so the Exception class must be extracted from *type.
            if type == NULL:
                exc_type = ParserError
            else:
                exc_type = <object>type

            Py_XDECREF(type)
            raise exc_type(old_exc)

    message = f'{base}. C error: '
    if parser.error_msg == NULL:
        message += 'no error message set'
    else:
        message += parser.error_msg.decode('utf-8')

    raise ParserError(message)
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# NA values
|
|
def _compute_na_values():
    """
    Build the mapping from numpy scalar type to its NA sentinel value.

    Signed integer types use their minimum value, unsigned integer types
    and ``np.bool_`` use the unsigned maximum (uint8 maximum for bool), and
    ``np.float64`` / ``np.object_`` use NaN.
    """
    sentinels = {np.float64: np.nan}

    for signed in (np.int64, np.int32, np.int16, np.int8):
        sentinels[signed] = np.iinfo(signed).min

    for unsigned in (np.uint64, np.uint32, np.uint16, np.uint8):
        sentinels[unsigned] = np.iinfo(unsigned).max

    sentinels[np.bool_] = np.iinfo(np.uint8).max
    sentinels[np.object_] = np.nan  # oof
    return sentinels
|
|
|
|
|
|
# Module-level table of NA sentinels, keyed by numpy scalar type.
na_values = _compute_na_values()

# Register every sentinel under the corresponding np.dtype instance as well,
# so lookups by ``arr.dtype`` work. Iterate over a snapshot of the keys
# because the dict is modified inside the loop.
for k in list(na_values):
    na_values[np.dtype(k)] = na_values[k]
|
|
|
|
|
|
# -> ArrayLike
|
|
cdef _apply_converter(object f, parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end):
    """
    Apply the Python callable ``f`` to every word of a column.

    Each word is boxed with ``PyUnicode_FromString`` before being passed to
    ``f``; the collected results are handed to
    ``lib.maybe_convert_objects`` and its return value is returned.
    """
    cdef:
        Py_ssize_t row, nrows
        coliter_t it
        const char *token = NULL
        ndarray[object] converted

    nrows = line_end - line_start
    converted = np.empty(nrows, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    for row in range(nrows):
        COLITER_NEXT(it, token)
        converted[row] = f(PyUnicode_FromString(token))

    return lib.maybe_convert_objects(converted)
|
|
|
|
|
|
cdef list _maybe_encode(list values):
    # Return a new list in which every ``str`` element of ``values`` is
    # UTF-8 encoded; other elements are kept as they are.
    # ``None`` yields an empty list.
    cdef list encoded = []

    if values is None:
        return encoded

    for item in values:
        if isinstance(item, str):
            encoded.append(item.encode('utf-8'))
        else:
            encoded.append(item)
    return encoded
|
|
|
|
|
|
def sanitize_objects(ndarray[object] values, set na_values) -> int:
    """
    Replace NA markers in ``values`` with np.nan, in place.

    Elements found in ``na_values`` are overwritten with ``np.nan``. Every
    other element is replaced by the first equal element seen earlier in
    the array, so that equal values share a single object.

    Parameters
    ----------
    values : ndarray[object]
        Modified in place.
    na_values : set

    Returns
    -------
    na_count : int
        Number of elements that were replaced by ``np.nan``.
    """
    cdef:
        Py_ssize_t idx
        Py_ssize_t total = len(values)
        Py_ssize_t na_count = 0
        object item
        object nan_obj = np.nan
        dict seen = {}

    for idx in range(total):
        item = values[idx]
        if item in na_values:
            values[idx] = nan_obj
            na_count += 1
        else:
            # first occurrence stores and returns itself; later equal
            # values get the stored object back
            values[idx] = seen.setdefault(item, item)

    return na_count
|