""" SpellChecker Module; simple, intuitive spell checker based on the post by
Peter Norvig. See: https://norvig.com/spell-correct.html """
import gzip
import json
import pkgutil
import string
import typing
from collections import Counter
from collections.abc import Iterable
from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


class SpellChecker:
"""The SpellChecker class encapsulates the basics needed to accomplish a
simple spell checking algorithm. It is based on the work by
Peter Norvig (https://norvig.com/spell-correct.html)
Args:
        language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
            `en`, `es`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, and `eu`. Defaults to `en`. A list of languages may be \
            provided and all languages will be loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
            will be loaded
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (function): The function to use to tokenize a string into words; if not provided, a simple \
            whitespace-regex tokenizer is used.
        case_sensitive (bool): Flag to use a case sensitive dictionary or not; only available when not using a \
            language dictionary.
Note:
Using a case sensitive dictionary can be slow to correct words."""
__slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]
def __init__(
self,
language: typing.Union[str, typing.Iterable[str]] = "en",
local_dictionary: typing.Optional[str] = None,
distance: int = 2,
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
case_sensitive: bool = False,
) -> None:
self._distance = 2 # default
self.distance = distance # use the setter value check
if tokenizer:
self._tokenizer = tokenizer
else:
self._tokenizer = _parse_into_words
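        # the bundled language dictionaries are stored lower-cased, so case
        # sensitivity is only honored when no language dictionary is loaded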
self._case_sensitive = case_sensitive if not language else False
self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)
if local_dictionary:
self._word_frequency.load_dictionary(local_dictionary)
elif language:
if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
language = [language] # type: ignore
for lang in language:
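                # each bundled dictionary ships inside the package as a gzipped JSON resource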
filename = f"resources/{lang.lower()}.json.gz"
try:
json_open = pkgutil.get_data("spellchecker", filename)
except FileNotFoundError as exc:
msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
raise ValueError(msg) from exc
if json_open:
lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
self._word_frequency.load_json(lang_dict)
def __contains__(self, key: KeyT) -> bool:
"""setup easier known checks"""
key = ensure_unicode(key)
return key in self._word_frequency
def __getitem__(self, key: KeyT) -> int:
"""setup easier frequency checks"""
key = ensure_unicode(key)
return self._word_frequency[key]
def __iter__(self) -> typing.Generator[str, None, None]:
"""setup iter support"""
yield from self._word_frequency.dictionary
@classmethod
def languages(cls) -> typing.Iterable[str]:
"""list: A list of all official languages supported by the library"""
return ["de", "en", "es", "fr", "pt", "ru", "ar", "lv", "eu"]
@property
def word_frequency(self) -> "WordFrequency":
"""WordFrequency: An encapsulation of the word frequency `dictionary`
Note:
Not settable"""
return self._word_frequency
@property
def distance(self) -> int:
"""int: The maximum edit distance to calculate
Note:
Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
return self._distance
@distance.setter
def distance(self, val: int) -> None:
"""set the distance parameter"""
tmp = 2
try:
if 0 < int(val) <= 2:
                tmp = int(val)  # store an int so later `self._distance == 2` checks behave
except (ValueError, TypeError):
pass
self._distance = tmp
def split_words(self, text: KeyT) -> typing.Iterable[str]:
"""Split text into individual `words` using either a simple whitespace
regex or the passed in tokenizer
Args:
text (str): The text to split into individual words
Returns:
list(str): A listing of all words in the provided text"""
text = ensure_unicode(text)
return self._tokenizer(text)
def export(self, filepath: str, encoding: str = "utf-8", gzipped: bool = True) -> None:
"""Export the word frequency list for import in the future
Args:
filepath (str): The filepath to the exported dictionary
encoding (str): The encoding of the resulting output
gzipped (bool): Whether to gzip the dictionary or not"""
data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
write_file(filepath, encoding, gzipped, data)
def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
"""Calculate the frequency to the `word` provided as seen across the
entire dictionary
Args:
word (str): The word for which the word probability is calculated
total_words (int): The total number of words to use in the calculation; \
use the default for using the whole word frequency
Returns:
float: The probability that the word is the correct word"""
if not total_words:
total_words = self._word_frequency.total_words
word = ensure_unicode(word)
        return self._word_frequency[word] / total_words  # index via WordFrequency to respect case handling
def correction(self, word: KeyT) -> typing.Optional[str]:
"""The most probable correct spelling for the word
Args:
word (str): The word to correct
Returns:
str: The most likely candidate or None if no correction is present"""
word = ensure_unicode(word)
candidates = self.candidates(word)
if not candidates:
return None
        return max(sorted(candidates), key=self.__getitem__)
def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
"""Generate possible spelling corrections for the provided word up to
an edit distance of two, if and only when needed
Args:
word (str): The word for which to calculate candidate spellings
Returns:
set: The set of words that are possible candidates or None if there are no candidates"""
word = ensure_unicode(word)
if self.known([word]): # short-cut if word is correct already
return {word}
if not self._check_if_should_check(word):
return {word}
# get edit distance 1...
res = list(self.edit_distance_1(word))
tmp = self.known(res)
if tmp:
return tmp
# if still not found, use the edit distance 1 to calc edit distance 2
if self._distance == 2:
tmp = self.known(list(self.__edit_distance_alt(res)))
if tmp:
return tmp
return None
def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that appear in the dictionary of words
Args:
words (list): List of words to determine which are in the corpus
Returns:
set: The set of those words from the input that are in the corpus"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}
def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that do not appear in the dictionary
Args:
words (list): List of words to determine which are not in the corpus
Returns:
set: The set of those words from the input that are not in the corpus"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
return {w for w in tmp if w not in self._word_frequency.dictionary}
def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
"""Compute all strings that are one edit away from `word` using only
the letters in the corpus
Args:
word (str): The word for which to calculate the edit distance
Returns:
set: The set of strings that are edit distance one from the provided word"""
tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
if self._check_if_should_check(tmp_word) is False:
return {tmp_word}
letters = self._word_frequency.letters
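        # Norvig-style candidate generation: split the word at every position,
        # then apply every single-character delete, transpose, replace, and insert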
splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edit_distance_2(self, word: KeyT) -> typing.List[str]:
"""Compute all strings that are two edits away from `word` using only
the letters in the corpus
Args:
word (str): The word for which to calculate the edit distance
        Returns:
            list: The list of strings that are edit distance two from the provided word"""
word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]
def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
"""Compute all strings that are 1 edits away from all the words using
only the letters in the corpus
Args:
words (list): The words for which to calculate the edit distance
Returns:
set: The set of strings that are edit distance two from the provided words"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]
def _check_if_should_check(self, word: str) -> bool:
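        """Determine whether a token is worth spell checking: skips lone
        punctuation, tokens far longer than any known word, and numbers"""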
if len(word) == 1 and word in string.punctuation:
return False
        if len(word) > self._word_frequency.longest_word_length + 3:  # allow a little slack beyond the longest known word
return False
if word.lower() == "nan": # nan passes the float(word) so this will bypass that issue (#125)
return True
try: # check if it is a number (int, float, etc)
float(word)
return False
except ValueError:
pass
return True


class WordFrequency:
"""Store the `dictionary` as a word frequency list while allowing for
different methods to load the data and update over time"""
__slots__ = [
"_dictionary",
"_total_words",
"_unique_words",
"_letters",
"_tokenizer",
"_case_sensitive",
"_longest_word_length",
]
    def __init__(
        self,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
self._dictionary = Counter()
self._total_words = 0
self._unique_words = 0
self._letters = set()
self._case_sensitive = case_sensitive
self._longest_word_length = 0
self._tokenizer = _parse_into_words
if tokenizer is not None:
self._tokenizer = tokenizer
def __contains__(self, key: KeyT) -> bool:
"""turn on contains"""
key = ensure_unicode(key)
key = key if self._case_sensitive else key.lower()
return key in self._dictionary
def __getitem__(self, key: KeyT) -> int:
"""turn on getitem"""
key = ensure_unicode(key)
key = key if self._case_sensitive else key.lower()
return self._dictionary[key]
def __iter__(self) -> typing.Generator[str, None, None]:
"""turn on iter support"""
yield from self._dictionary
    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
"""Remove the key and return the associated value or default if not
found
Args:
key (str): The key to remove
default (obj): The value to return if key is not present"""
key = ensure_unicode(key)
return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)
@property
def dictionary(self) -> typing.Dict[str, int]:
"""Counter: A counting dictionary of all words in the corpus and the number
of times each has been seen
Note:
Not settable"""
return self._dictionary
@property
def total_words(self) -> int:
"""int: The sum of all word occurances in the word frequency dictionary
Note:
Not settable"""
return self._total_words
@property
def unique_words(self) -> int:
"""int: The total number of unique words in the word frequency list
Note:
Not settable"""
return self._unique_words
@property
def letters(self) -> typing.Set[str]:
"""set: The listing of all letters found within the corpus
Note:
Not settable"""
return self._letters
@property
def longest_word_length(self) -> int:
"""int: The longest word length in the dictionary
Note:
Not settable"""
return self._longest_word_length
def tokenize(self, text: KeyT) -> typing.Generator[str, None, None]:
"""Tokenize the provided string object into individual words
Args:
text (str): The string object to tokenize
Yields:
str: The next `word` in the tokenized string
Note:
            This yields the same words as `spellchecker.split_words()` (or those of a \
                provided tokenizer), lower-cased unless the dictionary is case sensitive."""
tmp_text = ensure_unicode(text)
for word in self._tokenizer(tmp_text):
yield word if self._case_sensitive else word.lower()
def keys(self) -> typing.Generator[str, None, None]:
"""Iterator over the key of the dictionary
Yields:
str: The next key in the dictionary
Note:
This is the same as `spellchecker.words()`"""
yield from self._dictionary.keys()
def words(self) -> typing.Generator[str, None, None]:
"""Iterator over the words in the dictionary
Yields:
str: The next word in the dictionary
Note:
This is the same as `spellchecker.keys()`"""
yield from self._dictionary.keys()
def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
"""Iterator over the words in the dictionary
Yields:
str: The next word in the dictionary
int: The number of instances in the dictionary
Note:
This is the same as `dict.items()`"""
yield from self._dictionary.items()
def load_dictionary(self, filename: str, encoding: str = "utf-8") -> None:
"""Load in a pre-built word frequency list
Args:
filename (str): The filepath to the json (optionally gzipped) file to be loaded
encoding (str): The encoding of the dictionary"""
with load_file(filename, encoding) as data:
data = data if self._case_sensitive else data.lower()
self._dictionary.update(json.loads(data))
self._update_dictionary()
def load_json(self, data: typing.Dict[str, int]) -> None:
"""Load in a pre-built word frequency list
Args:
data (dict): The dictionary to be loaded"""
        # mirror the other loaders: lower-case the keys unless case sensitive
        data = data if self._case_sensitive else {k.lower(): v for k, v in data.items()}
        self._dictionary.update(data)
self._update_dictionary()
def load_text_file(
self,
filename: str,
encoding: str = "utf-8",
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
) -> None:
"""Load in a text file from which to generate a word frequency list
Args:
filename (str): The filepath to the text file to be loaded
encoding (str): The encoding of the text file
tokenizer (function): The function to use to tokenize a string
"""
with load_file(filename, encoding=encoding) as data:
self.load_text(data, tokenizer)
def load_text(
self,
text: KeyT,
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
) -> None:
"""Load text from which to generate a word frequency list
Args:
text (str): The text to be loaded
tokenizer (function): The function to use to tokenize a string
"""
text = ensure_unicode(text)
if tokenizer:
words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
else:
words = self.tokenize(text) # type: ignore
self._dictionary.update(words)
self._update_dictionary()
def load_words(self, words: typing.Iterable[KeyT]) -> None:
"""Load a list of words from which to generate a word frequency list
Args:
words (list): The list of words to be loaded"""
words = [ensure_unicode(w) for w in words]
self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
self._update_dictionary()
def add(self, word: KeyT, val: int = 1) -> None:
"""Add a word to the word frequency list
Args:
word (str): The word to add
val (int): The number of times to insert the word"""
word = ensure_unicode(word)
self.load_json({word if self._case_sensitive else word.lower(): val})
def remove_words(self, words: typing.Iterable[KeyT]) -> None:
"""Remove a list of words from the word frequency list
Args:
words (list): The list of words to remove"""
words = [ensure_unicode(w) for w in words]
for word in words:
self.pop(word)
self._update_dictionary()
def remove(self, word: KeyT) -> None:
"""Remove a word from the word frequency list
Args:
word (str): The word to remove"""
self.pop(word)
self._update_dictionary()
def remove_by_threshold(self, threshold: int = 5) -> None:
"""Remove all words at, or below, the provided threshold
Args:
threshold (int): The threshold at which a word is to be removed"""
to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
self.remove_words(to_remove)
def _update_dictionary(self) -> None:
"""Update the word frequency object"""
self._longest_word_length = 0
self._total_words = sum(self._dictionary.values())
self._unique_words = len(self._dictionary.keys())
self._letters = set()
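        # rebuild the letter set so edit_distance_1 only generates candidates
        # from characters that actually occur in the corpus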
for key in self._dictionary:
if len(key) > self._longest_word_length:
self._longest_word_length = len(key)
self._letters.update(key)
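

if __name__ == "__main__":
    # A minimal usage sketch, not part of the library API: it exercises the
    # primary entry points defined above. It assumes the bundled `en`
    # dictionary resource is available; the sample sentence is arbitrary.
    checker = SpellChecker()  # loads the default English dictionary

    words = checker.split_words("This sentense has a coupple of issues")
    for word in checker.unknown(words):
        # the single most probable correction (None if nothing is known)
        print(f"{word} -> {checker.correction(word)}")
        # every known candidate within the configured edit distance
        print(f"  candidates: {checker.candidates(word)}")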