from collections import defaultdict import cython from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np cimport numpy as cnp from numpy cimport ( NPY_INTP, int64_t, intp_t, ndarray, ) cnp.import_array() from pandas._libs.algos import ensure_int64 from pandas._libs.arrays cimport NDArrayBacked from pandas._libs.util cimport ( is_array, is_integer_object, ) @cython.final @cython.freelist(32) cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: slice _as_slice ndarray _as_array # Note: this still allows `None`; will be intp_t bint _has_slice, _has_array, _is_known_slice_like def __cinit__(self, val): cdef: slice slc self._as_slice = None self._as_array = None self._has_slice = False self._has_array = False if is_integer_object(val): slc = slice(val, val + 1, 1) self._as_slice = slc self._has_slice = True elif isinstance(val, slice): slc = slice_canonize(val) if slc.start != slc.stop: self._as_slice = slc self._has_slice = True else: arr = np.empty(0, dtype=np.intp) self._as_array = arr self._has_array = True else: # Cython memoryview interface requires ndarray to be writeable. if ( not is_array(val) or not cnp.PyArray_ISWRITEABLE(val) or (val).descr.type_num != cnp.NPY_INTP ): arr = np.require(val, dtype=np.intp, requirements='W') else: arr = val # Caller is responsible for ensuring arr.ndim == 1 self._as_array = arr self._has_array = True def __str__(self) -> str: cdef: slice s = self._ensure_has_slice() if s is not None: v = self._as_slice else: v = self._as_array return f"{type(self).__name__}({v})" def __repr__(self) -> str: return str(self) def __len__(self) -> int: cdef: slice s = self._ensure_has_slice() if s is not None: return slice_len(s) else: return len(self._as_array) def __iter__(self): cdef: slice s = self._ensure_has_slice() Py_ssize_t start, stop, step, _ if s is not None: start, stop, step, _ = slice_get_indices_ex(s) return iter(range(start, stop, step)) else: return iter(self._as_array) @property def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() if s is not None: return s else: raise TypeError("Not slice-like") @property def indexer(self): cdef: slice s = self._ensure_has_slice() if s is not None: return s else: return self._as_array @property def as_array(self) -> np.ndarray: cdef: Py_ssize_t start, stop, end, _ if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) # NOTE: this is the C-optimized equivalent of # `np.arange(start, stop, step, dtype=np.intp)` self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP) self._has_array = True return self._as_array @property def is_slice_like(self) -> bool: cdef: slice s = self._ensure_has_slice() return s is not None def __getitem__(self, loc): cdef: slice s = self._ensure_has_slice() if s is not None: val = slice_getitem(s, loc) else: val = self._as_array[loc] if not isinstance(val, slice) and val.ndim == 0: return val return BlockPlacement(val) def delete(self, loc) -> BlockPlacement: return BlockPlacement(np.delete(self.as_array, loc, axis=0)) def append(self, others) -> BlockPlacement: if not len(others): return self return BlockPlacement( np.concatenate([self.as_array] + [o.as_array for o in others]) ) cdef BlockPlacement iadd(self, other): cdef: slice s = self._ensure_has_slice() Py_ssize_t other_int, start, stop, step if is_integer_object(other) and s is not None: other_int = other if other_int == 0: # BlockPlacement is treated as immutable return self start, stop, step, _ = slice_get_indices_ex(s) start += other_int stop += other_int if (step > 0 and start < 0) or (step < 0 and stop < step): raise ValueError("iadd causes length change") if stop < 0: val = slice(start, None, step) else: val = slice(start, stop, step) return BlockPlacement(val) else: newarr = self.as_array + other if (newarr < 0).any(): raise ValueError("iadd causes length change") val = newarr return BlockPlacement(val) def add(self, other) -> BlockPlacement: # We can get here with int or ndarray return self.iadd(other) cdef slice _ensure_has_slice(self): if not self._has_slice: self._as_slice = indexer_as_slice(self._as_array) self._has_slice = True return self._as_slice cpdef BlockPlacement increment_above(self, Py_ssize_t loc): """ Increment any entries of 'loc' or above by one. """ cdef: slice nv, s = self._ensure_has_slice() Py_ssize_t other_int, start, stop, step ndarray[intp_t, ndim=1] newarr if s is not None: # see if we are either all-above or all-below, each of which # have fastpaths available. start, stop, step, _ = slice_get_indices_ex(s) if start < loc and stop <= loc: # We are entirely below, nothing to increment return self if start >= loc and stop >= loc: # We are entirely above, we can efficiently increment out slice nv = slice(start + 1, stop + 1, step) return BlockPlacement(nv) if loc == 0: # fastpath where we know everything is >= 0 newarr = self.as_array + 1 return BlockPlacement(newarr) newarr = self.as_array.copy() newarr[newarr >= loc] += 1 return BlockPlacement(newarr) def tile_for_unstack(self, factor: int) -> np.ndarray: """ Find the new mgr_locs for the un-stacked version of a Block. """ cdef: slice slc = self._ensure_has_slice() slice new_slice ndarray[intp_t, ndim=1] new_placement if slc is not None and slc.step == 1: new_slc = slice(slc.start * factor, slc.stop * factor, 1) # equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp) new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP) else: # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None` mapped = [ # equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp) cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP) for x in self ] new_placement = np.concatenate(mapped) return new_placement cdef slice slice_canonize(slice s): """ Convert slice to canonical bounded form. """ cdef: Py_ssize_t start = 0, stop = 0, step = 1 if s.step is None: step = 1 else: step = s.step if step == 0: raise ValueError("slice step cannot be zero") if step > 0: if s.stop is None: raise ValueError("unbounded slice") stop = s.stop if s.start is None: start = 0 else: start = s.start if start > stop: start = stop elif step < 0: if s.start is None: raise ValueError("unbounded slice") start = s.start if s.stop is None: stop = -1 else: stop = s.stop if stop > start: stop = start if start < 0 or (stop < 0 and s.stop is not None and step > 0): raise ValueError("unbounded slice") if stop < 0: return slice(start, None, step) else: return slice(start, stop, step) cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. The slice must not have any "open" bounds that would create dependency on container size, i.e.: - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None`` - if ``s.step < 0``, ``s.start`` is not ``None`` Otherwise, the result is unreliable. """ cdef: Py_ssize_t start, stop, step, length if slc is None: raise TypeError("slc must be slice") # pragma: no cover PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return length cdef (Py_ssize_t, Py_ssize_t, Py_ssize_t, Py_ssize_t) slice_get_indices_ex( slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX ): """ Get (start, stop, step, length) tuple for a slice. If `objlen` is not specified, slice must be bounded, otherwise the result will be wrong. """ cdef: Py_ssize_t start, stop, step, length if slc is None: raise TypeError("slc should be a slice") # pragma: no cover PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length cdef slice_getitem(slice slc, ind): cdef: Py_ssize_t s_start, s_stop, s_step, s_len Py_ssize_t ind_start, ind_stop, ind_step, ind_len s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc) if isinstance(ind, slice): ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len) if ind_step > 0 and ind_len == s_len: # short-cut for no-op slice if ind_len == s_len: return slc if ind_step < 0: s_start = s_stop - s_step ind_step = -ind_step s_step *= ind_step s_stop = s_start + ind_stop * s_step s_start = s_start + ind_start * s_step if s_step < 0 and s_stop < 0: return slice(s_start, None, s_step) else: return slice(s_start, s_stop, s_step) else: # NOTE: # this is the C-optimized equivalent of # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]` return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind] @cython.boundscheck(False) @cython.wraparound(False) cdef slice indexer_as_slice(intp_t[:] vals): cdef: Py_ssize_t i, n, start, stop int64_t d if vals is None: raise TypeError("vals must be ndarray") # pragma: no cover n = vals.shape[0] if n == 0 or vals[0] < 0: return None if n == 1: return slice(vals[0], vals[0] + 1, 1) if vals[1] < 0: return None # n > 2 d = vals[1] - vals[0] if d == 0: return None for i in range(2, n): if vals[i] < 0 or vals[i] - vals[i - 1] != d: return None start = vals[0] stop = start + n * d if stop < 0 and d < 0: return slice(start, None, d) else: return slice(start, stop, d) @cython.boundscheck(False) @cython.wraparound(False) def get_blkno_indexers( int64_t[:] blknos, bint group=True ) -> list[tuple[int, slice | np.ndarray]]: """ Enumerate contiguous runs of integers in ndarray. Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))`` pairs for each contiguous run found. If `group` is True and there is more than one run for a certain blkno, ``(blkno, array)`` with an array containing positions of all elements equal to blkno. Returns ------- list[tuple[int, slice | np.ndarray]] """ # There's blkno in this function's name because it's used in block & # blockno handling. cdef: int64_t cur_blkno Py_ssize_t i, start, stop, n, diff cnp.npy_intp tot_len int64_t blkno object group_dict = defaultdict(list) ndarray[int64_t, ndim=1] arr n = blknos.shape[0] result = list() start = 0 cur_blkno = blknos[start] if n == 0: pass elif group is False: for i in range(1, n): if blknos[i] != cur_blkno: result.append((cur_blkno, slice(start, i))) start = i cur_blkno = blknos[i] result.append((cur_blkno, slice(start, n))) else: for i in range(1, n): if blknos[i] != cur_blkno: group_dict[cur_blkno].append((start, i)) start = i cur_blkno = blknos[i] group_dict[cur_blkno].append((start, n)) for blkno, slices in group_dict.items(): if len(slices) == 1: result.append((blkno, slice(slices[0][0], slices[0][1]))) else: tot_len = sum(stop - start for start, stop in slices) # equiv np.empty(tot_len, dtype=np.int64) arr = cnp.PyArray_EMPTY(1, &tot_len, cnp.NPY_INT64, 0) i = 0 for start, stop in slices: for diff in range(start, stop): arr[i] = diff i += 1 result.append((blkno, arr)) return result def get_blkno_placements(blknos, group: bool = True): """ Parameters ---------- blknos : np.ndarray[int64] group : bool, default True Returns ------- iterator yield (blkno, BlockPlacement) """ blknos = ensure_int64(blknos) for blkno, indexer in get_blkno_indexers(blknos, group): yield blkno, BlockPlacement(indexer) @cython.boundscheck(False) @cython.wraparound(False) cpdef update_blklocs_and_blknos( ndarray[intp_t, ndim=1] blklocs, ndarray[intp_t, ndim=1] blknos, Py_ssize_t loc, intp_t nblocks, ): """ Update blklocs and blknos when a new column is inserted at 'loc'. """ cdef: Py_ssize_t i cnp.npy_intp length = len(blklocs) + 1 ndarray[intp_t, ndim=1] new_blklocs, new_blknos # equiv: new_blklocs = np.empty(length, dtype=np.intp) new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) for i in range(loc): new_blklocs[i] = blklocs[i] new_blknos[i] = blknos[i] new_blklocs[loc] = 0 new_blknos[loc] = nblocks for i in range(loc, length - 1): new_blklocs[i + 1] = blklocs[i] new_blknos[i + 1] = blknos[i] return new_blklocs, new_blknos def _unpickle_block(values, placement, ndim): # We have to do some gymnastics b/c "ndim" is keyword-only from pandas.core.internals.blocks import new_block return new_block(values, placement, ndim=ndim) @cython.freelist(64) cdef class SharedBlock: """ Defining __init__ in a cython class significantly improves performance. """ cdef: public BlockPlacement _mgr_locs readonly int ndim def __cinit__(self, values, placement: BlockPlacement, ndim: int): """ Parameters ---------- values : np.ndarray or ExtensionArray We assume maybe_coerce_values has already been called. placement : BlockPlacement ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame """ self._mgr_locs = placement self.ndim = ndim cpdef __reduce__(self): args = (self.values, self.mgr_locs.indexer, self.ndim) return _unpickle_block, args cpdef __setstate__(self, state): from pandas.core.construction import extract_array self.mgr_locs = BlockPlacement(state[0]) self.values = extract_array(state[1], extract_numpy=True) if len(state) > 2: # we stored ndim self.ndim = state[2] else: # older pickle from pandas.core.internals.api import maybe_infer_ndim ndim = maybe_infer_ndim(self.values, self.mgr_locs) self.ndim = ndim cdef class NumpyBlock(SharedBlock): cdef: public ndarray values def __cinit__(self, ndarray values, BlockPlacement placement, int ndim): # set values here the (implicit) call to SharedBlock.__cinit__ will # set placement and ndim self.values = values cpdef NumpyBlock getitem_block_index(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. Assumes self.ndim == 2 """ new_values = self.values[..., slicer] return type(self)(new_values, self._mgr_locs, ndim=self.ndim) cdef class NDArrayBackedBlock(SharedBlock): """ Block backed by NDArrayBackedExtensionArray """ cdef public: NDArrayBacked values def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim): # set values here the (implicit) call to SharedBlock.__cinit__ will # set placement and ndim self.values = values cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. Assumes self.ndim == 2 """ new_values = self.values[..., slicer] return type(self)(new_values, self._mgr_locs, ndim=self.ndim) cdef class Block(SharedBlock): cdef: public object values def __cinit__(self, object values, BlockPlacement placement, int ndim): # set values here the (implicit) call to SharedBlock.__cinit__ will # set placement and ndim self.values = values @cython.freelist(64) cdef class BlockManager: cdef: public tuple blocks public list axes public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs def __cinit__(self, blocks=None, axes=None, verify_integrity=True): # None as defaults for unpickling GH#42345 if blocks is None: # This adds 1-2 microseconds to DataFrame(np.array([])) return if isinstance(blocks, list): # Backward compat for e.g. pyarrow blocks = tuple(blocks) self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False self._is_consolidated = False self._blknos = None self._blklocs = None # ------------------------------------------------------------------- # Block Placement def _rebuild_blknos_and_blklocs(self) -> None: """ Update mgr._blknos / mgr._blklocs. """ cdef: intp_t blkno, i, j cnp.npy_intp length = self.shape[0] SharedBlock blk BlockPlacement bp ndarray[intp_t, ndim=1] new_blknos, new_blklocs # equiv: np.empty(length, dtype=np.intp) new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) # equiv: new_blknos.fill(-1) cnp.PyArray_FILLWBYTE(new_blknos, -1) cnp.PyArray_FILLWBYTE(new_blklocs, -1) for blkno, blk in enumerate(self.blocks): bp = blk._mgr_locs # Iterating over `bp` is a faster equivalent to # new_blknos[bp.indexer] = blkno # new_blklocs[bp.indexer] = np.arange(len(bp)) for i, j in enumerate(bp): new_blknos[j] = blkno new_blklocs[j] = i for i in range(length): # faster than `for blkno in new_blknos` # https://github.com/cython/cython/issues/4393 blkno = new_blknos[i] # If there are any -1s remaining, this indicates that our mgr_locs # are invalid. if blkno == -1: raise AssertionError("Gaps in blk ref_locs") self._blknos = new_blknos self._blklocs = new_blklocs # ------------------------------------------------------------------- # Pickle cpdef __reduce__(self): if len(self.axes) == 1: # SingleBlockManager, __init__ expects Block, axis args = (self.blocks[0], self.axes[0]) else: args = (self.blocks, self.axes) return type(self), args cpdef __setstate__(self, state): from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( ensure_block_shape, new_block, ) from pandas.core.internals.managers import ensure_index if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: state = state[3]["0.14.1"] axes = [ensure_index(ax) for ax in state["axes"]] ndim = len(axes) for blk in state["blocks"]: vals = blk["values"] # older versions may hold e.g. DatetimeIndex instead of DTA vals = extract_array(vals, extract_numpy=True) blk["values"] = ensure_block_shape(vals, ndim=ndim) nbs = [ new_block(blk["values"], blk["mgr_locs"], ndim=ndim) for blk in state["blocks"] ] blocks = tuple(nbs) self.blocks = blocks self.axes = axes else: # pragma: no cover raise NotImplementedError("pre-0.14.1 pickles are no longer supported") self._post_setstate() def _post_setstate(self) -> None: self._is_consolidated = False self._known_consolidated = False self._rebuild_blknos_and_blklocs() # ------------------------------------------------------------------- # Indexing cdef BlockManager _get_index_slice(self, slobj): cdef: SharedBlock blk, nb BlockManager mgr ndarray blknos, blklocs nbs = [] for blk in self.blocks: nb = blk.getitem_block_index(slobj) nbs.append(nb) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs blknos = self._blknos if blknos is not None: mgr._blknos = blknos.copy() mgr._blklocs = blklocs.copy() return mgr def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: return self._get_index_slice(slobj) else: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)