""" SpellChecker Module; simple, intuitive spell checker based on the post by
Peter Norvig. See: https://norvig.com/spell-correct.html """
import gzip
import json
import pkgutil
import string
import typing
from collections import Counter
from collections.abc import Iterable
from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file


class SpellChecker:
"""The SpellChecker class encapsulates the basics needed to accomplish a
simple spell checking algorithm. It is based on the work by
Peter Norvig (https://norvig.com/spell-correct.html)
Args:
        language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
            `en`, `es`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, and `eu`. Defaults to `en`. A list of languages may be \
            provided and all languages will be loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
            will be loaded
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (function): The function to use to tokenize a string into words; if not provided, a simple \
            whitespace-regex tokenizer is used.
        case_sensitive (bool): Flag to use a case sensitive dictionary or not; only available when not using a \
            language dictionary.
Note:
Using a case sensitive dictionary can be slow to correct words."""
__slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]
def __init__(
self,
language: typing.Union[str, typing.Iterable[str]] = "en",
local_dictionary: typing.Optional[str] = None,
distance: int = 2,
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
case_sensitive: bool = False,
) -> None:
self._distance = 2 # default
self.distance = distance # use the setter value check
if tokenizer:
self._tokenizer = tokenizer
else:
self._tokenizer = _parse_into_words
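        # the bundled language dictionaries are stored lower-cased, so case
        # sensitivity is only honored when no language dictionary is loaded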
self._case_sensitive = case_sensitive if not language else False
self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)
if local_dictionary:
self._word_frequency.load_dictionary(local_dictionary)
elif language:
if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
language = [language] # type: ignore
for lang in language:
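                # each bundled dictionary ships inside the package as a gzipped JSON resource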
filename = f"resources/{lang.lower()}.json.gz"
try:
json_open = pkgutil.get_data("spellchecker", filename)
except FileNotFoundError as exc:
msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
raise ValueError(msg) from exc
if json_open:
lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
self._word_frequency.load_json(lang_dict)
def __contains__(self, key: KeyT) -> bool:
"""setup easier known checks"""
key = ensure_unicode(key)
return key in self._word_frequency
def __getitem__(self, key: KeyT) -> int:
"""setup easier frequency checks"""
key = ensure_unicode(key)
return self._word_frequency[key]
def __iter__(self) -> typing.Generator[str, None, None]:
"""setup iter support"""
yield from self._word_frequency.dictionary
@classmethod
def languages(cls) -> typing.Iterable[str]:
"""list: A list of all official languages supported by the library"""
return ["de", "en", "es", "fr", "pt", "ru", "ar", "lv", "eu"]
@property
def word_frequency(self) -> "WordFrequency":
"""WordFrequency: An encapsulation of the word frequency `dictionary`
Note:
Not settable"""
return self._word_frequency
@property
def distance(self) -> int:
"""int: The maximum edit distance to calculate
Note:
Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
return self._distance
@distance.setter
def distance(self, val: int) -> None:
"""set the distance parameter"""
tmp = 2
try:
if 0 < int(val) <= 2:
                tmp = int(val)  # store an int so later `self._distance == 2` checks behave
except (ValueError, TypeError):
pass
self._distance = tmp
def split_words(self, text: KeyT) -> typing.Iterable[str]:
"""Split text into individual `words` using either a simple whitespace
regex or the passed in tokenizer
Args:
text (str): The text to split into individual words
Returns:
list(str): A listing of all words in the provided text"""
text = ensure_unicode(text)
return self._tokenizer(text)
def export(self, filepath: str, encoding: str = "utf-8", gzipped: bool = True) -> None:
"""Export the word frequency list for import in the future
Args:
filepath (str): The filepath to the exported dictionary
encoding (str): The encoding of the resulting output
gzipped (bool): Whether to gzip the dictionary or not"""
data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
write_file(filepath, encoding, gzipped, data)
def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
"""Calculate the frequency to the `word` provided as seen across the
entire dictionary
Args:
word (str): The word for which the word probability is calculated
total_words (int): The total number of words to use in the calculation; \
use the default for using the whole word frequency
Returns:
float: The probability that the word is the correct word"""
if not total_words:
total_words = self._word_frequency.total_words
word = ensure_unicode(word)
        return self._word_frequency[word] / total_words  # index via WordFrequency to respect case handling
def correction(self, word: KeyT) -> typing.Optional[str]:
"""The most probable correct spelling for the word
Args:
word (str): The word to correct
Returns:
str: The most likely candidate or None if no correction is present"""
word = ensure_unicode(word)
candidates = self.candidates(word)
if not candidates:
return None
        return max(sorted(candidates), key=self.__getitem__)
def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
"""Generate possible spelling corrections for the provided word up to
an edit distance of two, if and only when needed
Args:
word (str): The word for which to calculate candidate spellings
Returns:
set: The set of words that are possible candidates or None if there are no candidates"""
word = ensure_unicode(word)
if self.known([word]): # short-cut if word is correct already
return {word}
if not self._check_if_should_check(word):
return {word}
# get edit distance 1...
res = list(self.edit_distance_1(word))
tmp = self.known(res)
if tmp:
return tmp
# if still not found, use the edit distance 1 to calc edit distance 2
if self._distance == 2:
tmp = self.known(list(self.__edit_distance_alt(res)))
if tmp:
return tmp
return None
def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that appear in the dictionary of words
Args:
words (list): List of words to determine which are in the corpus
Returns:
set: The set of those words from the input that are in the corpus"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}
def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
"""The subset of `words` that do not appear in the dictionary
Args:
words (list): List of words to determine which are not in the corpus
Returns:
set: The set of those words from the input that are not in the corpus"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
return {w for w in tmp if w not in self._word_frequency.dictionary}
def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
"""Compute all strings that are one edit away from `word` using only
the letters in the corpus
Args:
word (str): The word for which to calculate the edit distance
Returns:
set: The set of strings that are edit distance one from the provided word"""
tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
if self._check_if_should_check(tmp_word) is False:
return {tmp_word}
letters = self._word_frequency.letters
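        # Norvig-style candidate generation: split the word at every position,
        # then apply every single-character delete, transpose, replace, and insert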
splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edit_distance_2(self, word: KeyT) -> typing.List[str]:
"""Compute all strings that are two edits away from `word` using only
the letters in the corpus
Args:
word (str): The word for which to calculate the edit distance
        Returns:
            list: The list of strings that are edit distance two from the provided word"""
word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]
def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
"""Compute all strings that are 1 edits away from all the words using
only the letters in the corpus
Args:
words (list): The words for which to calculate the edit distance
Returns:
set: The set of strings that are edit distance two from the provided words"""
tmp_words = [ensure_unicode(w) for w in words]
tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]
def _check_if_should_check(self, word: str) -> bool:
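        """Determine whether a token is worth spell checking: skips lone
        punctuation, tokens far longer than any known word, and numbers"""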
if len(word) == 1 and word in string.punctuation:
return False
        if len(word) > self._word_frequency.longest_word_length + 3:  # allow a little slack beyond the longest known word
return False
if word.lower() == "nan": # nan passes the float(word) so this will bypass that issue (#125)
return True
try: # check if it is a number (int, float, etc)
float(word)
return False
except ValueError:
pass
return True


class WordFrequency:
"""Store the `dictionary` as a word frequency list while allowing for
different methods to load the data and update over time"""
__slots__ = [
"_dictionary",
"_total_words",
"_unique_words",
"_letters",
"_tokenizer",
"_case_sensitive",
"_longest_word_length",
]
    def __init__(
        self,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
self._dictionary = Counter()
self._total_words = 0
self._unique_words = 0
self._letters = set()
self._case_sensitive = case_sensitive
self._longest_word_length = 0
self._tokenizer = _parse_into_words
if tokenizer is not None:
self._tokenizer = tokenizer
def __contains__(self, key: KeyT) -> bool:
"""turn on contains"""
key = ensure_unicode(key)
key = key if self._case_sensitive else key.lower()
return key in self._dictionary
def __getitem__(self, key: KeyT) -> int:
"""turn on getitem"""
key = ensure_unicode(key)
key = key if self._case_sensitive else key.lower()
return self._dictionary[key]
def __iter__(self) -> typing.Generator[str, None, None]:
"""turn on iter support"""
yield from self._dictionary
    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
"""Remove the key and return the associated value or default if not
found
Args:
key (str): The key to remove
default (obj): The value to return if key is not present"""
key = ensure_unicode(key)
return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)
@property
def dictionary(self) -> typing.Dict[str, int]:
"""Counter: A counting dictionary of all words in the corpus and the number
of times each has been seen
Note:
Not settable"""
return self._dictionary
@property
def total_words(self) -> int:
"""int: The sum of all word occurances in the word frequency dictionary
Note:
Not settable"""
return self._total_words
@property
def unique_words(self) -> int:
"""int: The total number of unique words in the word frequency list
Note:
Not settable"""
return self._unique_words
@property
def letters(self) -> typing.Set[str]:
"""set: The listing of all letters found within the corpus
Note:
Not settable"""
return self._letters
@property
def longest_word_length(self) -> int:
"""int: The longest word length in the dictionary
Note:
Not settable"""
return self._longest_word_length
def tokenize(self, text: KeyT) -> typing.Generator[str, None, None]:
"""Tokenize the provided string object into individual words
Args:
text (str): The string object to tokenize
Yields:
str: The next `word` in the tokenized string
Note:
            This yields the same words as `spellchecker.split_words()` (or those of a \
                provided tokenizer), lower-cased unless the dictionary is case sensitive."""
tmp_text = ensure_unicode(text)
for word in self._tokenizer(tmp_text):
yield word if self._case_sensitive else word.lower()
def keys(self) -> typing.Generator[str, None, None]:
"""Iterator over the key of the dictionary
Yields:
str: The next key in the dictionary
Note:
This is the same as `spellchecker.words()`"""
yield from self._dictionary.keys()
def words(self) -> typing.Generator[str, None, None]:
"""Iterator over the words in the dictionary
Yields:
str: The next word in the dictionary
Note:
This is the same as `spellchecker.keys()`"""
yield from self._dictionary.keys()
def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
"""Iterator over the words in the dictionary
Yields:
str: The next word in the dictionary
int: The number of instances in the dictionary
Note:
This is the same as `dict.items()`"""
yield from self._dictionary.items()
def load_dictionary(self, filename: str, encoding: str = "utf-8") -> None:
"""Load in a pre-built word frequency list
Args:
filename (str): The filepath to the json (optionally gzipped) file to be loaded
encoding (str): The encoding of the dictionary"""
with load_file(filename, encoding) as data:
data = data if self._case_sensitive else data.lower()
self._dictionary.update(json.loads(data))
self._update_dictionary()
def load_json(self, data: typing.Dict[str, int]) -> None:
"""Load in a pre-built word frequency list
Args:
data (dict): The dictionary to be loaded"""
        # mirror the other loaders: lower-case the keys unless case sensitive
        data = data if self._case_sensitive else {k.lower(): v for k, v in data.items()}
        self._dictionary.update(data)
self._update_dictionary()
def load_text_file(
self,
filename: str,
encoding: str = "utf-8",
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
) -> None:
"""Load in a text file from which to generate a word frequency list
Args:
filename (str): The filepath to the text file to be loaded
encoding (str): The encoding of the text file
tokenizer (function): The function to use to tokenize a string
"""
with load_file(filename, encoding=encoding) as data:
self.load_text(data, tokenizer)
def load_text(
self,
text: KeyT,
tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
) -> None:
"""Load text from which to generate a word frequency list
Args:
text (str): The text to be loaded
tokenizer (function): The function to use to tokenize a string
"""
text = ensure_unicode(text)
if tokenizer:
words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
else:
words = self.tokenize(text) # type: ignore
self._dictionary.update(words)
self._update_dictionary()
def load_words(self, words: typing.Iterable[KeyT]) -> None:
"""Load a list of words from which to generate a word frequency list
Args:
words (list): The list of words to be loaded"""
words = [ensure_unicode(w) for w in words]
self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
self._update_dictionary()
def add(self, word: KeyT, val: int = 1) -> None:
"""Add a word to the word frequency list
Args:
word (str): The word to add
val (int): The number of times to insert the word"""
word = ensure_unicode(word)
self.load_json({word if self._case_sensitive else word.lower(): val})
def remove_words(self, words: typing.Iterable[KeyT]) -> None:
"""Remove a list of words from the word frequency list
Args:
words (list): The list of words to remove"""
words = [ensure_unicode(w) for w in words]
for word in words:
self.pop(word)
self._update_dictionary()
def remove(self, word: KeyT) -> None:
"""Remove a word from the word frequency list
Args:
word (str): The word to remove"""
self.pop(word)
self._update_dictionary()
def remove_by_threshold(self, threshold: int = 5) -> None:
"""Remove all words at, or below, the provided threshold
Args:
threshold (int): The threshold at which a word is to be removed"""
to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
self.remove_words(to_remove)
def _update_dictionary(self) -> None:
"""Update the word frequency object"""
self._longest_word_length = 0
self._total_words = sum(self._dictionary.values())
self._unique_words = len(self._dictionary.keys())
self._letters = set()
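        # rebuild the letter set so edit_distance_1 only generates candidates
        # from characters that actually occur in the corpus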
for key in self._dictionary:
if len(key) > self._longest_word_length:
self._longest_word_length = len(key)
self._letters.update(key)
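

if __name__ == "__main__":
    # A minimal usage sketch, not part of the library API: it exercises the
    # primary entry points defined above. It assumes the bundled `en`
    # dictionary resource is available; the sample sentence is arbitrary.
    checker = SpellChecker()  # loads the default English dictionary

    words = checker.split_words("This sentense has a coupple of issues")
    for word in checker.unknown(words):
        # the single most probable correction (None if nothing is known)
        print(f"{word} -> {checker.correction(word)}")
        # every known candidate within the configured edit distance
        print(f"  candidates: {checker.candidates(word)}")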