import cython
from cython import Py_ssize_t

from libc.math cimport (
    fabs,
    sqrt,
)
from libc.stdlib cimport (
    free,
    malloc,
)
from libc.string cimport memmove

import numpy as np

cimport numpy as cnp
from numpy cimport (
    NPY_COMPLEX64,
    NPY_COMPLEX128,
    NPY_FLOAT32,
    NPY_FLOAT64,
    NPY_INT8,
    NPY_INT16,
    NPY_INT32,
    NPY_INT64,
    NPY_OBJECT,
    NPY_UINT8,
    NPY_UINT16,
    NPY_UINT32,
    NPY_UINT64,
    float32_t,
    float64_t,
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)

cnp.import_array()

cimport pandas._libs.util as util
from pandas._libs.dtypes cimport (
    iu_64_floating_obj_t,
    numeric_object_t,
    numeric_t,
)
from pandas._libs.khash cimport (
    kh_destroy_int64,
    kh_get_int64,
    kh_init_int64,
    kh_int64_t,
    kh_put_int64,
    kh_resize_int64,
    khiter_t,
)
from pandas._libs.util cimport get_nat

import pandas._libs.missing as missing

cdef:
    float64_t FP_ERR = 1e-13
    float64_t NaN = <float64_t>np.NaN
    int64_t NPY_NAT = get_nat()


tiebreakers = {
    "average": TIEBREAK_AVERAGE,
    "min": TIEBREAK_MIN,
    "max": TIEBREAK_MAX,
    "first": TIEBREAK_FIRST,
    "dense": TIEBREAK_DENSE,
}


cdef inline bint are_diff(object left, object right):
    try:
        return fabs(left - right) > FP_ERR
    except TypeError:
        return left != right


class Infinity:
    """
    Provide a positive Infinity comparison method for ranking.
    """
    __lt__ = lambda self, other: False
    __le__ = lambda self, other: isinstance(other, Infinity)
    __eq__ = lambda self, other: isinstance(other, Infinity)
    __ne__ = lambda self, other: not isinstance(other, Infinity)
    __gt__ = lambda self, other: (not isinstance(other, Infinity) and
                                  not missing.checknull(other))
    __ge__ = lambda self, other: not missing.checknull(other)


class NegInfinity:
    """
    Provide a negative Infinity comparison method for ranking.
    """
    __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
                                  not missing.checknull(other))
    __le__ = lambda self, other: not missing.checknull(other)
    __eq__ = lambda self, other: isinstance(other, NegInfinity)
    __ne__ = lambda self, other: not isinstance(other, NegInfinity)
    __gt__ = lambda self, other: False
    __ge__ = lambda self, other: isinstance(other, NegInfinity)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
    """
    Efficiently find the unique first-differences of the given array.

    Parameters
    ----------
    arr : ndarray[int64_t]

    Returns
    -------
    ndarray[int64_t]
        An ordered ndarray[int64_t]
    """
    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []
        ndarray[int64_t, ndim=1] result

    table = kh_init_int64()
    kh_resize_int64(table, 10)
    for i in range(n - 1):
        val = arr[i + 1] - arr[i]
        k = kh_get_int64(table, val)
        if k == table.n_buckets:
            kh_put_int64(table, val, &ret)
            uniques.append(val)
    kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result


@cython.wraparound(False)
@cython.boundscheck(False)
def is_lexsorted(list_of_arrays: list) -> bint:
    cdef:
        Py_ssize_t i
        Py_ssize_t n, nlevels
        int64_t k, cur, pre
        ndarray arr
        bint result = True

    nlevels = len(list_of_arrays)
    n = len(list_of_arrays[0])

    cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
    for i in range(nlevels):
        arr = list_of_arrays[i]
        assert arr.dtype.name == 'int64'
        vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)

    # Assume uniqueness??
    with nogil:
        for i in range(1, n):
            for k in range(nlevels):
                cur = vecs[k][i]
                pre = vecs[k][i -1]
                if cur == pre:
                    continue
                elif cur > pre:
                    break
                else:
                    result = False
                    break
    free(vecs)
    return result


@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
    """
    Compute a 1-d indexer.

    The indexer is an ordering of the passed index,
    ordered by the groups.

    Parameters
    ----------
    index: np.ndarray[np.intp]
        Mappings from group -> position.
    ngroups: int64
        Number of groups.

    Returns
    -------
    ndarray[intp_t, ndim=1]
        Indexer
    ndarray[intp_t, ndim=1]
        Group Counts

    Notes
    -----
    This is a reverse of the label factorization process.
    """
    cdef:
        Py_ssize_t i, label, n
        intp_t[::1] indexer, where, counts

    counts = np.zeros(ngroups + 1, dtype=np.intp)
    n = len(index)
    indexer = np.zeros(n, dtype=np.intp)
    where = np.zeros(ngroups + 1, dtype=np.intp)

    with nogil:

        # count group sizes, location 0 for NA
        for i in range(n):
            counts[index[i] + 1] += 1

        # mark the start of each contiguous group of like-indexed data
        for i in range(1, ngroups + 1):
            where[i] = where[i - 1] + counts[i - 1]

        # this is our indexer
        for i in range(n):
            label = index[i] + 1
            indexer[where[label]] = i
            where[label] += 1

    return indexer.base, counts.base


cdef inline Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil:
    cdef:
        numeric_t t

    # cython doesn't allow pointer dereference so use array syntax
    t = a[0]
    a[0] = b[0]
    b[0] = t
    return 0


cdef inline numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil:
    """
    See kth_smallest.__doc__. The additional parameter n specifies the maximum
    number of elements considered in arr, needed for compatibility with usage
    in groupby.pyx
    """
    cdef:
        Py_ssize_t i, j, left, m
        numeric_t x

    left = 0
    m = n - 1

    while left < m:
        x = arr[k]
        i = left
        j = m

        while 1:
            while arr[i] < x:
                i += 1
            while x < arr[j]:
                j -= 1
            if i <= j:
                swap(&arr[i], &arr[j])
                i += 1
                j -= 1

            if i > j:
                break

        if j < k:
            left = i
        if k < i:
            m = j
    return arr[k]


@cython.boundscheck(False)
@cython.wraparound(False)
def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
    """
    Compute the kth smallest value in arr. Note that the input
    array will be modified.

    Parameters
    ----------
    arr : numeric[::1]
        Array to compute the kth smallest value for, must be
        contiguous
    k : Py_ssize_t

    Returns
    -------
    numeric
        The kth smallest value in arr
    """
    cdef:
        numeric_t result

    with nogil:
        result = kth_smallest_c(&arr[0], k, arr.shape[0])

    return result


# ----------------------------------------------------------------------
# Pairwise correlation/covariance


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
    cdef:
        Py_ssize_t i, j, xi, yi, N, K
        bint minpv
        float64_t[:, ::1] result
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy

    N, K = (<object>mat).shape

    if minp is None:
        minpv = 1
    else:
        minpv = <int>minp

    result = np.empty((K, K), dtype=np.float64)
    mask = np.isfinite(mat).view(np.uint8)

    with nogil:
        for xi in range(K):
            for yi in range(xi + 1):
                # Welford's method for the variance-calculation
                # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
                nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
                for i in range(N):
                    if mask[i, xi] and mask[i, yi]:
                        vx = mat[i, xi]
                        vy = mat[i, yi]
                        nobs += 1
                        dx = vx - meanx
                        dy = vy - meany
                        meanx += 1 / nobs * dx
                        meany += 1 / nobs * dy
                        ssqdmx += (vx - meanx) * dx
                        ssqdmy += (vy - meany) * dy
                        covxy += (vx - meanx) * dy

                if nobs < minpv:
                    result[xi, yi] = result[yi, xi] = NaN
                else:
                    divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)

                    if divisor != 0:
                        result[xi, yi] = result[yi, xi] = covxy / divisor
                    else:
                        result[xi, yi] = result[yi, xi] = NaN

    return result.base

# ----------------------------------------------------------------------
# Pairwise Spearman correlation


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
    cdef:
        Py_ssize_t i, j, xi, yi, N, K
        ndarray[float64_t, ndim=2] result
        ndarray[float64_t, ndim=2] ranked_mat
        ndarray[float64_t, ndim=1] rankedx, rankedy
        float64_t[::1] maskedx, maskedy
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        bint no_nans
        float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor

    N, K = (<object>mat).shape

    # Handle the edge case where we know all results will be nan
    # to keep conditional logic inside loop simpler
    if N < minp:
        result = np.full((K, K), np.nan, dtype=np.float64)
        return result

    result = np.empty((K, K), dtype=np.float64)
    mask = np.isfinite(mat).view(np.uint8)
    no_nans = mask.all()

    ranked_mat = np.empty((N, K), dtype=np.float64)

    # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe
    # here since N >= nobs and values are stored contiguously
    maskedx = np.empty(N, dtype=np.float64)
    maskedy = np.empty(N, dtype=np.float64)
    for i in range(K):
        ranked_mat[:, i] = rank_1d(mat[:, i])

    with nogil:
        for xi in range(K):
            for yi in range(xi + 1):
                sumx = sumxx = sumyy = 0

                # Fastpath for data with no nans/infs, allows avoiding mask checks
                # and array reassignments
                if no_nans:
                    mean = (N + 1) / 2.

                    # now the cov numerator
                    for i in range(N):
                        vx = ranked_mat[i, xi] - mean
                        vy = ranked_mat[i, yi] - mean

                        sumx += vx * vy
                        sumxx += vx * vx
                        sumyy += vy * vy
                else:
                    nobs = 0
                    # Keep track of whether we need to recompute ranks
                    all_ranks = True
                    for i in range(N):
                        all_ranks &= not (mask[i, xi] ^ mask[i, yi])
                        if mask[i, xi] and mask[i, yi]:
                            maskedx[nobs] = ranked_mat[i, xi]
                            maskedy[nobs] = ranked_mat[i, yi]
                            nobs += 1

                    if nobs < minp:
                        result[xi, yi] = result[yi, xi] = NaN
                        continue
                    else:
                        if not all_ranks:
                            with gil:
                                # We need to slice back to nobs because rank_1d will
                                # require arrays of nobs length
                                rankedx = rank_1d(np.asarray(maskedx)[:nobs])
                                rankedy = rank_1d(np.asarray(maskedy)[:nobs])
                            for i in range(nobs):
                                maskedx[i] = rankedx[i]
                                maskedy[i] = rankedy[i]

                        mean = (nobs + 1) / 2.

                        # now the cov numerator
                        for i in range(nobs):
                            vx = maskedx[i] - mean
                            vy = maskedy[i] - mean

                            sumx += vx * vy
                            sumxx += vx * vx
                            sumyy += vy * vy

                divisor = sqrt(sumxx * sumyy)

                if divisor != 0:
                    result[xi, yi] = result[yi, xi] = sumx / divisor
                else:
                    result[xi, yi] = result[yi, xi] = NaN

    return result


# ----------------------------------------------------------------------

def validate_limit(nobs: int | None, limit=None) -> int:
    """
    Check that the `limit` argument is a positive integer.

    Parameters
    ----------
    nobs : int
    limit : object

    Returns
    -------
    int
        The limit.
    """
    if limit is None:
        lim = nobs
    else:
        if not util.is_integer_object(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')
        lim = limit

    return lim


@cython.boundscheck(False)
@cython.wraparound(False)
def pad(
    ndarray[numeric_object_t] old,
    ndarray[numeric_object_t] new,
    limit=None
) -> ndarray:
    # -> ndarray[intp_t, ndim=1]
    cdef:
        Py_ssize_t i, j, nleft, nright
        ndarray[intp_t, ndim=1] indexer
        numeric_object_t cur, next_val
        int lim, fill_count = 0

    nleft = len(old)
    nright = len(new)
    indexer = np.empty(nright, dtype=np.intp)
    indexer[:] = -1

    lim = validate_limit(nright, limit)

    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
        return indexer

    i = j = 0

    cur = old[0]

    while j <= nright - 1 and new[j] < cur:
        j += 1

    while True:
        if j == nright:
            break

        if i == nleft - 1:
            while j < nright:
                if new[j] == cur:
                    indexer[j] = i
                elif new[j] > cur and fill_count < lim:
                    indexer[j] = i
                    fill_count += 1
                j += 1
            break

        next_val = old[i + 1]

        while j < nright and cur <= new[j] < next_val:
            if new[j] == cur:
                indexer[j] = i
            elif fill_count < lim:
                indexer[j] = i
                fill_count += 1
            j += 1

        fill_count = 0
        i += 1
        cur = next_val

    return indexer


@cython.boundscheck(False)
@cython.wraparound(False)
def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
    cdef:
        Py_ssize_t i, N
        numeric_object_t val
        uint8_t prev_mask
        int lim, fill_count = 0

    N = len(values)

    # GH#2778
    if N == 0:
        return

    lim = validate_limit(N, limit)

    val = values[0]
    prev_mask = mask[0]
    for i in range(N):
        if mask[i]:
            if fill_count >= lim:
                continue
            fill_count += 1
            values[i] = val
            mask[i] = prev_mask
        else:
            fill_count = 0
            val = values[i]
            prev_mask = mask[i]


@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
    cdef:
        Py_ssize_t i, j, N, K
        numeric_object_t val
        int lim, fill_count = 0

    K, N = (<object>values).shape

    # GH#2778
    if N == 0:
        return

    lim = validate_limit(N, limit)

    for j in range(K):
        fill_count = 0
        val = values[j, 0]
        for i in range(N):
            if mask[j, i]:
                if fill_count >= lim or i == 0:
                    continue
                fill_count += 1
                values[j, i] = val
                mask[j, i] = False
            else:
                fill_count = 0
                val = values[j, i]


"""
Backfilling logic for generating fill vector

Diagram of what's going on

Old      New    Fill vector    Mask
         .        0               1
         .        0               1
         .        0               1
A        A        0               1
         .        1               1
         .        1               1
         .        1               1
         .        1               1
         .        1               1
B        B        1               1
         .        2               1
         .        2               1
         .        2               1
C        C        2               1
         .                        0
         .                        0
D
"""


@cython.boundscheck(False)
@cython.wraparound(False)
def backfill(
    ndarray[numeric_object_t] old,
    ndarray[numeric_object_t] new,
    limit=None
) -> ndarray:
    # -> ndarray[intp_t, ndim=1]
    cdef:
        Py_ssize_t i, j, nleft, nright
        ndarray[intp_t, ndim=1] indexer
        numeric_object_t cur, prev
        int lim, fill_count = 0

    nleft = len(old)
    nright = len(new)
    indexer = np.empty(nright, dtype=np.intp)
    indexer[:] = -1

    lim = validate_limit(nright, limit)

    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
        return indexer

    i = nleft - 1
    j = nright - 1

    cur = old[nleft - 1]

    while j >= 0 and new[j] > cur:
        j -= 1

    while True:
        if j < 0:
            break

        if i == 0:
            while j >= 0:
                if new[j] == cur:
                    indexer[j] = i
                elif new[j] < cur and fill_count < lim:
                    indexer[j] = i
                    fill_count += 1
                j -= 1
            break

        prev = old[i - 1]

        while j >= 0 and prev < new[j] <= cur:
            if new[j] == cur:
                indexer[j] = i
            elif new[j] < cur and fill_count < lim:
                indexer[j] = i
                fill_count += 1
            j -= 1

        fill_count = 0
        i -= 1
        cur = prev

    return indexer


def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
    pad_inplace(values[::-1], mask[::-1], limit=limit)


def backfill_2d_inplace(numeric_object_t[:, :] values,
                        uint8_t[:, :] mask,
                        limit=None):
    pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)


@cython.boundscheck(False)
@cython.wraparound(False)
def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike):
    """
    Returns
    -------
    tuple
        is_monotonic_inc : bool
        is_monotonic_dec : bool
        is_unique : bool
    """
    cdef:
        Py_ssize_t i, n
        numeric_object_t prev, cur
        bint is_monotonic_inc = 1
        bint is_monotonic_dec = 1
        bint is_unique = 1
        bint is_strict_monotonic = 1

    n = len(arr)

    if n == 1:
        if arr[0] != arr[0] or (timelike and <int64_t>arr[0] == NPY_NAT):
            # single value is NaN
            return False, False, True
        else:
            return True, True, True
    elif n < 2:
        return True, True, True

    if timelike and <int64_t>arr[0] == NPY_NAT:
        return False, False, True

    if numeric_object_t is not object:
        with nogil:
            prev = arr[0]
            for i in range(1, n):
                cur = arr[i]
                if timelike and <int64_t>cur == NPY_NAT:
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                if cur < prev:
                    is_monotonic_inc = 0
                elif cur > prev:
                    is_monotonic_dec = 0
                elif cur == prev:
                    is_unique = 0
                else:
                    # cur or prev is NaN
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                if not is_monotonic_inc and not is_monotonic_dec:
                    is_monotonic_inc = 0
                    is_monotonic_dec = 0
                    break
                prev = cur
    else:
        # object-dtype, identical to above except we cannot use `with nogil`
        prev = arr[0]
        for i in range(1, n):
            cur = arr[i]
            if timelike and <int64_t>cur == NPY_NAT:
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            if cur < prev:
                is_monotonic_inc = 0
            elif cur > prev:
                is_monotonic_dec = 0
            elif cur == prev:
                is_unique = 0
            else:
                # cur or prev is NaN
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            if not is_monotonic_inc and not is_monotonic_dec:
                is_monotonic_inc = 0
                is_monotonic_dec = 0
                break
            prev = cur

    is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec)
    return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic


# ----------------------------------------------------------------------
# rank_1d, rank_2d
# ----------------------------------------------------------------------

cdef iu_64_floating_obj_t get_rank_nan_fill_val(
        bint rank_nans_highest,
        iu_64_floating_obj_t[:] _=None
):
    """
    Return the value we'll use to represent missing values when sorting depending
    on if we'd like missing values to end up at the top/bottom. (The second parameter
    is unused, but needed for fused type specialization)
    """
    if rank_nans_highest:
        if iu_64_floating_obj_t is object:
            return Infinity()
        elif iu_64_floating_obj_t is int64_t:
            return util.INT64_MAX
        elif iu_64_floating_obj_t is uint64_t:
            return util.UINT64_MAX
        else:
            return np.inf
    else:
        if iu_64_floating_obj_t is object:
            return NegInfinity()
        elif iu_64_floating_obj_t is int64_t:
            return NPY_NAT
        elif iu_64_floating_obj_t is uint64_t:
            return 0
        else:
            return -np.inf


@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d(
    ndarray[iu_64_floating_obj_t, ndim=1] values,
    const intp_t[:] labels=None,
    bint is_datetimelike=False,
    ties_method="average",
    bint ascending=True,
    bint pct=False,
    na_option="keep",
):
    """
    Fast NaN-friendly version of ``scipy.stats.rankdata``.

    Parameters
    ----------
    values : array of iu_64_floating_obj_t values to be ranked
    labels : np.ndarray[np.intp] or None
        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`. If not called
        from a groupby operation, will be None.
    is_datetimelike : bool, default False
        True if `values` contains datetime-like entries.
    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
        'average'
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    ascending : bool, default True
        False for ranks by high (1) to low (N)
        na_option : {'keep', 'top', 'bottom'}, default 'keep'
    pct : bool, default False
        Compute percentage rank of data within each group
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        * keep: leave NA values where they are
        * top: smallest rank if ascending
        * bottom: smallest rank if descending
    """
    cdef:
        TiebreakEnumType tiebreak
        Py_ssize_t N
        int64_t[::1] grp_sizes
        intp_t[:] lexsort_indexer
        float64_t[::1] out
        ndarray[iu_64_floating_obj_t, ndim=1] masked_vals
        iu_64_floating_obj_t[:] masked_vals_memview
        uint8_t[:] mask
        bint keep_na, nans_rank_highest, check_labels, check_mask
        iu_64_floating_obj_t nan_fill_val

    tiebreak = tiebreakers[ties_method]
    if tiebreak == TIEBREAK_FIRST:
        if not ascending:
            tiebreak = TIEBREAK_FIRST_DESCENDING

    keep_na = na_option == 'keep'

    N = len(values)
    if labels is not None:
        # TODO(cython3): cast won't be necessary (#2992)
        assert <Py_ssize_t>len(labels) == N
    out = np.empty(N)
    grp_sizes = np.ones(N, dtype=np.int64)

    # If we don't care about labels, can short-circuit later label
    # comparisons
    check_labels = labels is not None

    # For cases where a mask is not possible, we can avoid mask checks
    check_mask = not (iu_64_floating_obj_t is uint64_t or
                      (iu_64_floating_obj_t is int64_t and not is_datetimelike))

    # Copy values into new array in order to fill missing data
    # with mask, without obfuscating location of missing data
    # in values array
    if iu_64_floating_obj_t is object and values.dtype != np.object_:
        masked_vals = values.astype('O')
    else:
        masked_vals = values.copy()

    if iu_64_floating_obj_t is object:
        mask = missing.isnaobj(masked_vals)
    elif iu_64_floating_obj_t is int64_t and is_datetimelike:
        mask = (masked_vals == NPY_NAT).astype(np.uint8)
    elif iu_64_floating_obj_t is float64_t:
        mask = np.isnan(masked_vals).astype(np.uint8)
    else:
        mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)

    # If `na_option == 'top'`, we want to assign the lowest rank
    # to NaN regardless of ascending/descending. So if ascending,
    # fill with lowest value of type to end up with lowest rank.
    # If descending, fill with highest value since descending
    # will flip the ordering to still end up with lowest rank.
    # Symmetric logic applies to `na_option == 'bottom'`
    nans_rank_highest = ascending ^ (na_option == 'top')
    nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest)
    if nans_rank_highest:
        order = [masked_vals, mask]
    else:
        order = [masked_vals, ~(np.asarray(mask))]

    if check_labels:
        order.append(labels)

    np.putmask(masked_vals, mask, nan_fill_val)
    # putmask doesn't accept a memoryview, so we assign as a separate step
    masked_vals_memview = masked_vals

    # lexsort using labels, then mask, then actual values
    # each label corresponds to a different group value,
    # the mask helps you differentiate missing values before
    # performing sort on the actual values
    lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)

    if not ascending:
        lexsort_indexer = lexsort_indexer[::-1]

    with nogil:
        rank_sorted_1d(
            out,
            grp_sizes,
            lexsort_indexer,
            masked_vals_memview,
            mask,
            check_mask=check_mask,
            N=N,
            tiebreak=tiebreak,
            keep_na=keep_na,
            pct=pct,
            labels=labels,
        )

    return np.asarray(out)


@cython.wraparound(False)
@cython.boundscheck(False)
cdef void rank_sorted_1d(
    float64_t[::1] out,
    int64_t[::1] grp_sizes,
    const intp_t[:] sort_indexer,
    # Can make const with cython3 (https://github.com/cython/cython/issues/3222)
    iu_64_floating_obj_t[:] masked_vals,
    const uint8_t[:] mask,
    bint check_mask,
    Py_ssize_t N,
    TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
    bint keep_na=True,
    bint pct=False,
    # https://github.com/cython/cython/issues/1630, only trailing arguments can
    # currently be omitted for cdef functions, which is why we keep this at the end
    const intp_t[:] labels=None,
) nogil:
    """
    See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
    be handled in the caller. Note that `out` and `grp_sizes` are modified inplace.

    Parameters
    ----------
    out : float64_t[::1]
        Array to store computed ranks
    grp_sizes : int64_t[::1]
        Array to store group counts, only used if pct=True. Should only be None
        if labels is None.
    sort_indexer : intp_t[:]
        Array of indices which sorts masked_vals
    masked_vals : iu_64_floating_obj_t[:]
        The values input to rank_1d, with missing values replaced by fill values
    mask : uint8_t[:]
        Array where entries are True if the value is missing, False otherwise.
    check_mask : bool
        If False, assumes the mask is all False to skip mask indexing
    N : Py_ssize_t
        The number of elements to rank. Note: it is not always true that
        N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
    tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
        See rank_1d.__doc__ for the different modes
    keep_na : bool, default True
        Whether or not to keep nulls
    pct : bool, default False
        Compute percentage rank of data within each group
    labels : See rank_1d.__doc__, default None. None implies all labels are the same.
    """

    cdef:
        Py_ssize_t i, j, dups=0, sum_ranks=0,
        Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
        bint at_end, next_val_diff, group_changed, check_labels
        int64_t grp_size

    check_labels = labels is not None

    # Loop over the length of the value array
    # each incremental i value can be looked up in the lexsort_indexer
    # array that we sorted previously, which gives us the location of
    # that sorted value for retrieval back from the original
    # values / masked_vals arrays
    # TODO(cython3): de-duplicate once cython supports conditional nogil
    if iu_64_floating_obj_t is object:
        with gil:
            for i in range(N):
                at_end = i == N - 1

                # dups and sum_ranks will be incremented each loop where
                # the value / group remains the same, and should be reset
                # when either of those change. Used to calculate tiebreakers
                dups += 1
                sum_ranks += i - grp_start + 1

                next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
                                                   masked_vals[sort_indexer[i+1]])

                # We'll need this check later anyway to determine group size, so just
                # compute it here since shortcircuiting won't help
                group_changed = at_end or (check_labels and
                                           (labels[sort_indexer[i]]
                                            != labels[sort_indexer[i+1]]))

                # Update out only when there is a transition of values or labels.
                # When a new value or group is encountered, go back #dups steps(
                # the number of occurrence of current value) and assign the ranks
                # based on the starting index of the current group (grp_start)
                # and the current index
                if (next_val_diff or group_changed or (check_mask and
                                                       (mask[sort_indexer[i]]
                                                        ^ mask[sort_indexer[i+1]]))):

                    # If keep_na, check for missing values and assign back
                    # to the result where appropriate
                    if keep_na and check_mask and mask[sort_indexer[i]]:
                        grp_na_count = dups
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = NaN
                    elif tiebreak == TIEBREAK_AVERAGE:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = sum_ranks / <float64_t>dups
                    elif tiebreak == TIEBREAK_MIN:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = i - grp_start - dups + 2
                    elif tiebreak == TIEBREAK_MAX:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = i - grp_start + 1

                    # With n as the previous rank in the group and m as the number
                    # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
                    # then rankings should be n + 1, n + 2 ... n + m
                    elif tiebreak == TIEBREAK_FIRST:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = j + 1 - grp_start

                    # If TIEBREAK_FIRST and descending, the ranking should be
                    # n + m, n + (m - 1) ... n + 1. This is equivalent to
                    # (i - dups + 1) + (i - j + 1) - grp_start
                    elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
                    elif tiebreak == TIEBREAK_DENSE:
                        for j in range(i - dups + 1, i + 1):
                            out[sort_indexer[j]] = grp_vals_seen

                    # Look forward to the next value (using the sorting in
                    # lexsort_indexer). If the value does not equal the current
                    # value then we need to reset the dups and sum_ranks, knowing
                    # that a new value is coming up. The conditional also needs
                    # to handle nan equality and the end of iteration. If group
                    # changes we do not record seeing a new value in the group
                    if not group_changed and (next_val_diff or (check_mask and
                                              (mask[sort_indexer[i]]
                                               ^ mask[sort_indexer[i+1]]))):
                        dups = sum_ranks = 0
                        grp_vals_seen += 1

                    # Similar to the previous conditional, check now if we are
                    # moving to a new group. If so, keep track of the index where
                    # the new group occurs, so the tiebreaker calculations can
                    # decrement that from their position. Fill in the size of each
                    # group encountered (used by pct calculations later). Also be
                    # sure to reset any of the items helping to calculate dups
                    if group_changed:

                        # If not dense tiebreak, group size used to compute
                        # percentile will be # of non-null elements in group
                        if tiebreak != TIEBREAK_DENSE:
                            grp_size = i - grp_start + 1 - grp_na_count

                        # Otherwise, it will be the number of distinct values
                        # in the group, subtracting 1 if NaNs are present
                        # since that is a distinct value we shouldn't count
                        else:
                            grp_size = grp_vals_seen - (grp_na_count > 0)

                        for j in range(grp_start, i + 1):
                            grp_sizes[sort_indexer[j]] = grp_size

                        dups = sum_ranks = 0
                        grp_na_count = 0
                        grp_start = i + 1
                        grp_vals_seen = 1
    else:
        for i in range(N):
            at_end = i == N - 1

            # dups and sum_ranks will be incremented each loop where
            # the value / group remains the same, and should be reset
            # when either of those change. Used to calculate tiebreakers
            dups += 1
            sum_ranks += i - grp_start + 1

            next_val_diff = at_end or (masked_vals[sort_indexer[i]]
                                       != masked_vals[sort_indexer[i+1]])

            # We'll need this check later anyway to determine group size, so just
            # compute it here since shortcircuiting won't help
            group_changed = at_end or (check_labels and
                                       (labels[sort_indexer[i]]
                                        != labels[sort_indexer[i+1]]))

            # Update out only when there is a transition of values or labels.
            # When a new value or group is encountered, go back #dups steps(
            # the number of occurrence of current value) and assign the ranks
            # based on the starting index of the current group (grp_start)
            # and the current index
            if (next_val_diff or group_changed
                or (check_mask and
                    (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):

                # If keep_na, check for missing values and assign back
                # to the result where appropriate
                if keep_na and check_mask and mask[sort_indexer[i]]:
                    grp_na_count = dups
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = NaN
                elif tiebreak == TIEBREAK_AVERAGE:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = sum_ranks / <float64_t>dups
                elif tiebreak == TIEBREAK_MIN:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = i - grp_start - dups + 2
                elif tiebreak == TIEBREAK_MAX:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = i - grp_start + 1

                # With n as the previous rank in the group and m as the number
                # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
                # then rankings should be n + 1, n + 2 ... n + m
                elif tiebreak == TIEBREAK_FIRST:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = j + 1 - grp_start

                # If TIEBREAK_FIRST and descending, the ranking should be
                # n + m, n + (m - 1) ... n + 1. This is equivalent to
                # (i - dups + 1) + (i - j + 1) - grp_start
                elif tiebreak == TIEBREAK_FIRST_DESCENDING:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
                elif tiebreak == TIEBREAK_DENSE:
                    for j in range(i - dups + 1, i + 1):
                        out[sort_indexer[j]] = grp_vals_seen

                # Look forward to the next value (using the sorting in
                # lexsort_indexer). If the value does not equal the current
                # value then we need to reset the dups and sum_ranks, knowing
                # that a new value is coming up. The conditional also needs
                # to handle nan equality and the end of iteration. If group
                # changes we do not record seeing a new value in the group
                if not group_changed and (next_val_diff
                                          or (check_mask and
                                              (mask[sort_indexer[i]]
                                               ^ mask[sort_indexer[i+1]]))):
                    dups = sum_ranks = 0
                    grp_vals_seen += 1

                # Similar to the previous conditional, check now if we are
                # moving to a new group. If so, keep track of the index where
                # the new group occurs, so the tiebreaker calculations can
                # decrement that from their position. Fill in the size of each
                # group encountered (used by pct calculations later). Also be
                # sure to reset any of the items helping to calculate dups
                if group_changed:

                    # If not dense tiebreak, group size used to compute
                    # percentile will be # of non-null elements in group
                    if tiebreak != TIEBREAK_DENSE:
                        grp_size = i - grp_start + 1 - grp_na_count

                    # Otherwise, it will be the number of distinct values
                    # in the group, subtracting 1 if NaNs are present
                    # since that is a distinct value we shouldn't count
                    else:
                        grp_size = grp_vals_seen - (grp_na_count > 0)

                    for j in range(grp_start, i + 1):
                        grp_sizes[sort_indexer[j]] = grp_size

                    dups = sum_ranks = 0
                    grp_na_count = 0
                    grp_start = i + 1
                    grp_vals_seen = 1

    if pct:
        for i in range(N):
            if grp_sizes[i] != 0:
                out[i] = out[i] / grp_sizes[i]


def rank_2d(
    ndarray[iu_64_floating_obj_t, ndim=2] in_arr,
    int axis=0,
    bint is_datetimelike=False,
    ties_method="average",
    bint ascending=True,
    na_option="keep",
    bint pct=False,
):
    """
    Fast NaN-friendly version of ``scipy.stats.rankdata``.
    """
    cdef:
        Py_ssize_t k, n, col
        float64_t[::1, :] out  # Column-major so columns are contiguous
        int64_t[::1] grp_sizes
        ndarray[iu_64_floating_obj_t, ndim=2] values
        iu_64_floating_obj_t[:, :] masked_vals
        intp_t[:, :] sort_indexer
        uint8_t[:, :] mask
        TiebreakEnumType tiebreak
        bint check_mask, keep_na, nans_rank_highest
        iu_64_floating_obj_t nan_fill_val

    tiebreak = tiebreakers[ties_method]
    if tiebreak == TIEBREAK_FIRST:
        if not ascending:
            tiebreak = TIEBREAK_FIRST_DESCENDING

    keep_na = na_option == 'keep'

    # For cases where a mask is not possible, we can avoid mask checks
    check_mask = not (iu_64_floating_obj_t is uint64_t or
                      (iu_64_floating_obj_t is int64_t and not is_datetimelike))

    if axis == 1:
        values = np.asarray(in_arr).T.copy()
    else:
        values = np.asarray(in_arr).copy()

    if iu_64_floating_obj_t is object:
        if values.dtype != np.object_:
            values = values.astype('O')

    nans_rank_highest = ascending ^ (na_option == 'top')
    if check_mask:
        nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest)

        if iu_64_floating_obj_t is object:
            mask = missing.isnaobj2d(values).view(np.uint8)
        elif iu_64_floating_obj_t is float64_t:
            mask = np.isnan(values).view(np.uint8)

        # int64 and datetimelike
        else:
            mask = (values == NPY_NAT).view(np.uint8)
        np.putmask(values, mask, nan_fill_val)
    else:
        mask = np.zeros_like(values, dtype=np.uint8)

    if nans_rank_highest:
        order = (values, mask)
    else:
        order = (values, ~np.asarray(mask))

    n, k = (<object>values).shape
    out = np.empty((n, k), dtype='f8', order='F')
    grp_sizes = np.ones(n, dtype=np.int64)

    # lexsort is slower, so only use if we need to worry about the mask
    if check_mask:
        sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
    else:
        kind = "stable" if ties_method == "first" else None
        sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)

    if not ascending:
        sort_indexer = sort_indexer[::-1, :]

    # putmask doesn't accept a memoryview, so we assign in a separate step
    masked_vals = values
    with nogil:
        for col in range(k):
            rank_sorted_1d(
                out[:, col],
                grp_sizes,
                sort_indexer[:, col],
                masked_vals[:, col],
                mask[:, col],
                check_mask=check_mask,
                N=n,
                tiebreak=tiebreak,
                keep_na=keep_na,
                pct=pct,
            )

    if axis == 1:
        return np.asarray(out.T)
    else:
        return np.asarray(out)


ctypedef fused diff_t:
    float64_t
    float32_t
    int8_t
    int16_t
    int32_t
    int64_t

ctypedef fused out_t:
    float32_t
    float64_t
    int64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def diff_2d(
    ndarray[diff_t, ndim=2] arr,  # TODO(cython3) update to "const diff_t[:, :] arr"
    ndarray[out_t, ndim=2] out,
    Py_ssize_t periods,
    int axis,
    bint datetimelike=False,
):
    cdef:
        Py_ssize_t i, j, sx, sy, start, stop
        bint f_contig = arr.flags.f_contiguous
        # bint f_contig = arr.is_f_contig()  # TODO(cython3)
        diff_t left, right

    # Disable for unsupported dtype combinations,
    #  see https://github.com/cython/cython/issues/2646
    if (out_t is float32_t
            and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
        raise NotImplementedError  # pragma: no cover
    elif (out_t is float64_t
          and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
        raise NotImplementedError  # pragma: no cover
    elif out_t is int64_t and diff_t is not int64_t:
        # We only have out_t of int64_t if we have datetimelike
        raise NotImplementedError  # pragma: no cover
    else:
        # We put this inside an indented else block to avoid cython build
        #  warnings about unreachable code
        sx, sy = (<object>arr).shape
        with nogil:
            if f_contig:
                if axis == 0:
                    if periods >= 0:
                        start, stop = periods, sx
                    else:
                        start, stop = 0, sx + periods
                    for j in range(sy):
                        for i in range(start, stop):
                            left = arr[i, j]
                            right = arr[i - periods, j]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
                else:
                    if periods >= 0:
                        start, stop = periods, sy
                    else:
                        start, stop = 0, sy + periods
                    for j in range(start, stop):
                        for i in range(sx):
                            left = arr[i, j]
                            right = arr[i, j - periods]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
            else:
                if axis == 0:
                    if periods >= 0:
                        start, stop = periods, sx
                    else:
                        start, stop = 0, sx + periods
                    for i in range(start, stop):
                        for j in range(sy):
                            left = arr[i, j]
                            right = arr[i - periods, j]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right
                else:
                    if periods >= 0:
                        start, stop = periods, sy
                    else:
                        start, stop = 0, sy + periods
                    for i in range(sx):
                        for j in range(start, stop):
                            left = arr[i, j]
                            right = arr[i, j - periods]
                            if out_t is int64_t and datetimelike:
                                if left == NPY_NAT or right == NPY_NAT:
                                    out[i, j] = NPY_NAT
                                else:
                                    out[i, j] = left - right
                            else:
                                out[i, j] = left - right


# generated from template
include "algos_common_helper.pxi"
include "algos_take_helper.pxi"