""" SpellChecker Module; simple, intuitive spell checker based on the post by
|
|
Peter Norvig. See: https://norvig.com/spell-correct.html """
|
|
import gzip
|
|
import json
|
|
import pkgutil
|
|
import string
|
|
import typing
|
|
from collections import Counter
|
|
from collections.abc import Iterable
|
|
|
|
from .utils import KeyT, _parse_into_words, ensure_unicode, load_file, write_file
|
|
|
|
|
|
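
# A minimal usage sketch (comments only, as this is the library module itself);
# it assumes the package is importable as `spellchecker`:
#
#     from spellchecker import SpellChecker
#
#     spell = SpellChecker()  # loads the bundled English dictionary
#     for word in spell.unknown(["speling", "is", "hardd"]):
#         print(word, "->", spell.correction(word))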


class SpellChecker:
    """The SpellChecker class encapsulates the basics needed to accomplish a
    simple spell checking algorithm. It is based on the work by
    Peter Norvig (https://norvig.com/spell-correct.html)

    Args:
        language (str): The language of the dictionary to load, or None for no dictionary. Supported \
languages are `en`, `es`, `de`, `fr`, `pt`, `ru`, `ar`, `lv`, and `eu`. Defaults to `en`. A list of \
languages may be provided, in which case all of them are loaded.
        local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no \
language dictionary will be loaded.
        distance (int): The edit distance to use. Defaults to 2.
        tokenizer (function): The function used to split text into words; defaults to a simple \
whitespace-based tokenizer.
        case_sensitive (bool): Flag to use a case sensitive dictionary or not; only available when not \
using a language dictionary.
    Note:
        Using a case sensitive dictionary can be slow to correct words."""

    __slots__ = ["_distance", "_word_frequency", "_tokenizer", "_case_sensitive"]

    def __init__(
        self,
        language: typing.Union[str, typing.Iterable[str]] = "en",
        local_dictionary: typing.Optional[str] = None,
        distance: int = 2,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
        case_sensitive: bool = False,
    ) -> None:
        self._distance = 2  # default
        self.distance = distance  # use the setter value check

        if tokenizer:
            self._tokenizer = tokenizer
        else:
            self._tokenizer = _parse_into_words

        self._case_sensitive = case_sensitive if not language else False
        self._word_frequency = WordFrequency(self._tokenizer, self._case_sensitive)

        if local_dictionary:
            self._word_frequency.load_dictionary(local_dictionary)
        elif language:
            if not isinstance(language, Iterable) or isinstance(language, (str, bytes)):
                language = [language]  # type: ignore
            for lang in language:
                filename = f"resources/{lang.lower()}.json.gz"
                try:
                    json_open = pkgutil.get_data("spellchecker", filename)
                except FileNotFoundError as exc:
                    msg = f"The provided dictionary language ({lang.lower()}) does not exist!"
                    raise ValueError(msg) from exc
                if json_open:
                    lang_dict = json.loads(gzip.decompress(json_open).decode("utf-8"))
                    self._word_frequency.load_json(lang_dict)

    def __contains__(self, key: KeyT) -> bool:
        """setup easier known checks"""
        key = ensure_unicode(key)
        return key in self._word_frequency

    def __getitem__(self, key: KeyT) -> int:
        """setup easier frequency checks"""
        key = ensure_unicode(key)
        return self._word_frequency[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """setup iter support"""
        yield from self._word_frequency.dictionary

    @classmethod
    def languages(cls) -> typing.Iterable[str]:
        """list: A list of all official languages supported by the library"""
        return ["de", "en", "es", "fr", "pt", "ru", "ar", "lv", "eu"]

    @property
    def word_frequency(self) -> "WordFrequency":
        """WordFrequency: An encapsulation of the word frequency `dictionary`

        Note:
            Not settable"""
        return self._word_frequency

    @property
    def distance(self) -> int:
        """int: The maximum edit distance to calculate

        Note:
            Valid values are 1 or 2; if an invalid value is passed, defaults to 2"""
        return self._distance

    @distance.setter
    def distance(self, val: int) -> None:
        """set the distance parameter"""
        tmp = 2
        try:
            if 0 < int(val) <= 2:
                tmp = val
        except (ValueError, TypeError):
            pass
        self._distance = tmp
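
    # For example, `spell.distance = 1` limits candidate generation to
    # single-edit words (much faster for long words), while an out-of-range
    # value such as `spell.distance = 3` silently falls back to the default
    # of 2, per the setter above.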

    def split_words(self, text: KeyT) -> typing.Iterable[str]:
        """Split text into individual `words` using either a simple whitespace
        regex or the passed in tokenizer

        Args:
            text (str): The text to split into individual words
        Returns:
            list(str): A listing of all words in the provided text"""
        text = ensure_unicode(text)
        return self._tokenizer(text)

    def export(self, filepath: str, encoding: str = "utf-8", gzipped: bool = True) -> None:
        """Export the word frequency list for import in the future

        Args:
            filepath (str): The filepath to the exported dictionary
            encoding (str): The encoding of the resulting output
            gzipped (bool): Whether to gzip the dictionary or not"""
        data = json.dumps(self.word_frequency.dictionary, sort_keys=True)
        write_file(filepath, encoding, gzipped, data)

    def word_usage_frequency(self, word: KeyT, total_words: typing.Optional[int] = None) -> float:
        """Calculate the frequency of the `word` provided as seen across the
        entire dictionary

        Args:
            word (str): The word for which the word probability is calculated
            total_words (int): The total number of words to use in the calculation; \
use the default for using the whole word frequency
        Returns:
            float: The probability that the word is the correct word"""
        if not total_words:
            total_words = self._word_frequency.total_words
        word = ensure_unicode(word)
        return self._word_frequency.dictionary[word] / total_words
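
    # A worked example of word_usage_frequency (hypothetical counts): if "the"
    # was seen 1,000 times out of 20,000 total word occurrences, the result is
    # 1000 / 20000 = 0.05.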

    def correction(self, word: KeyT) -> typing.Optional[str]:
        """The most probable correct spelling for the word

        Args:
            word (str): The word to correct
        Returns:
            str: The most likely candidate or None if no correction is present"""
        word = ensure_unicode(word)
        candidates = self.candidates(word)
        if not candidates:
            return None
        return max(sorted(candidates), key=self.__getitem__)

    def candidates(self, word: KeyT) -> typing.Optional[typing.Set[str]]:
        """Generate possible spelling corrections for the provided word, up to
        an edit distance of two, but only when needed

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            set: The set of words that are possible candidates or None if there are no candidates"""
        word = ensure_unicode(word)
        if self.known([word]):  # short-cut if word is correct already
            return {word}

        if not self._check_if_should_check(word):
            return {word}

        # get edit distance 1...
        res = list(self.edit_distance_1(word))
        tmp = self.known(res)
        if tmp:
            return tmp
        # if still not found, use the edit distance 1 to calc edit distance 2
        if self._distance == 2:
            tmp = self.known(self.__edit_distance_alt(res))
            if tmp:
                return tmp
        return None
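
    # For instance, candidates("ther") would typically return known one-edit
    # words such as {"the", "there", "her", "them"} without ever building the
    # far larger two-edit set, since distance-1 matches already exist (exact
    # results depend on the loaded dictionary).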

    def known(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that appear in the dictionary of words

        Args:
            words (list): List of words to determine which are in the corpus
        Returns:
            set: The set of those words from the input that are in the corpus"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words]
        return {w for w in tmp if w in self._word_frequency.dictionary and self._check_if_should_check(w)}

    def unknown(self, words: typing.Iterable[KeyT]) -> typing.Set[str]:
        """The subset of `words` that do not appear in the dictionary

        Args:
            words (list): List of words to determine which are not in the corpus
        Returns:
            set: The set of those words from the input that are not in the corpus"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return {w for w in tmp if w not in self._word_frequency.dictionary}
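
    # E.g., spell.known(["apple", "appel"]) -> {"apple"} and
    # spell.unknown(["apple", "appel"]) -> {"appel"}, assuming "apple" is in
    # the loaded dictionary and "appel" is not.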

    def edit_distance_1(self, word: KeyT) -> typing.Set[str]:
        """Compute all strings that are one edit away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            set: The set of strings that are edit distance one from the provided word"""
        tmp_word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        if self._check_if_should_check(tmp_word) is False:
            return {tmp_word}
        letters = self._word_frequency.letters
        splits = [(tmp_word[:i], tmp_word[i:]) for i in range(len(tmp_word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)
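
    # Illustration for the two-letter word "ab" with corpus letters {"a", "b"}:
    # splits     -> [("", "ab"), ("a", "b"), ("ab", "")]
    # deletes    -> ["b", "a"]
    # transposes -> ["ba"]
    # replaces   -> include "aa" and "bb" (and "ab" itself)
    # inserts    -> include "aab", "abb", and "bab"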

    def edit_distance_2(self, word: KeyT) -> typing.List[str]:
        """Compute all strings that are two edits away from `word` using only
        the letters in the corpus

        Args:
            word (str): The word for which to calculate the edit distance
        Returns:
            list: The list of strings that are edit distance two from the provided word"""
        word = ensure_unicode(word).lower() if not self._case_sensitive else ensure_unicode(word)
        return [e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)]

    def __edit_distance_alt(self, words: typing.Iterable[KeyT]) -> typing.List[str]:
        """Compute all strings that are one edit away from each of the provided
        words, using only the letters in the corpus

        Args:
            words (list): The words for which to calculate the edit distance
        Returns:
            list: The list of strings that are edit distance two from the provided words"""
        tmp_words = [ensure_unicode(w) for w in words]
        tmp = [w if self._case_sensitive else w.lower() for w in tmp_words if self._check_if_should_check(w)]
        return [e2 for e1 in tmp for e2 in self.known(self.edit_distance_1(e1))]

    def _check_if_should_check(self, word: str) -> bool:
        if len(word) == 1 and word in string.punctuation:
            return False
        if len(word) > self._word_frequency.longest_word_length + 3:  # allow removal of up to 2 letters
            return False
        if word.lower() == "nan":  # "nan" passes the float(word) check below; bypass that issue (#125)
            return True
        try:  # check if it is a number (int, float, etc.)
            float(word)
            return False
        except ValueError:
            pass

        return True
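
    # Examples of tokens _check_if_should_check skips: single punctuation marks
    # such as "$", purely numeric tokens such as "1234" or "3.14", and any token
    # more than 3 characters longer than the longest dictionary word; "nan" is
    # explicitly allowed through even though float("nan") succeeds (#125).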


class WordFrequency:
    """Store the `dictionary` as a word frequency list while allowing for
    different methods to load the data and update over time"""

    __slots__ = [
        "_dictionary",
        "_total_words",
        "_unique_words",
        "_letters",
        "_tokenizer",
        "_case_sensitive",
        "_longest_word_length",
    ]

    def __init__(self, tokenizer=None, case_sensitive=False):
        self._dictionary = Counter()
        self._total_words = 0
        self._unique_words = 0
        self._letters = set()
        self._case_sensitive = case_sensitive
        self._longest_word_length = 0

        self._tokenizer = _parse_into_words
        if tokenizer is not None:
            self._tokenizer = tokenizer

    def __contains__(self, key: KeyT) -> bool:
        """turn on contains"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return key in self._dictionary

    def __getitem__(self, key: KeyT) -> int:
        """turn on getitem"""
        key = ensure_unicode(key)
        key = key if self._case_sensitive else key.lower()
        return self._dictionary[key]

    def __iter__(self) -> typing.Generator[str, None, None]:
        """turn on iter support"""
        yield from self._dictionary

    def pop(self, key: KeyT, default: typing.Optional[int] = None) -> typing.Optional[int]:
        """Remove the key and return the associated value or default if not
        found

        Args:
            key (str): The key to remove
            default (obj): The value to return if key is not present"""
        key = ensure_unicode(key)
        return self._dictionary.pop(key if self._case_sensitive else key.lower(), default)

    @property
    def dictionary(self) -> typing.Dict[str, int]:
        """Counter: A counting dictionary of all words in the corpus and the number
        of times each has been seen

        Note:
            Not settable"""
        return self._dictionary

    @property
    def total_words(self) -> int:
        """int: The sum of all word occurrences in the word frequency dictionary

        Note:
            Not settable"""
        return self._total_words

    @property
    def unique_words(self) -> int:
        """int: The total number of unique words in the word frequency list

        Note:
            Not settable"""
        return self._unique_words

    @property
    def letters(self) -> typing.Set[str]:
        """set: The listing of all letters found within the corpus

        Note:
            Not settable"""
        return self._letters

    @property
    def longest_word_length(self) -> int:
        """int: The longest word length in the dictionary

        Note:
            Not settable"""
        return self._longest_word_length

    def tokenize(self, text: KeyT) -> typing.Generator[str, None, None]:
        """Tokenize the provided string object into individual words

        Args:
            text (str): The string object to tokenize
        Yields:
            str: The next `word` in the tokenized string
        Note:
            This is the same as the `spellchecker.split_words()` unless a tokenizer function was provided."""
        tmp_text = ensure_unicode(text)
        for word in self._tokenizer(tmp_text):
            yield word if self._case_sensitive else word.lower()

    def keys(self) -> typing.Generator[str, None, None]:
        """Iterator over the keys of the dictionary

        Yields:
            str: The next key in the dictionary
        Note:
            This is the same as `spellchecker.words()`"""
        yield from self._dictionary.keys()

    def words(self) -> typing.Generator[str, None, None]:
        """Iterator over the words in the dictionary

        Yields:
            str: The next word in the dictionary
        Note:
            This is the same as `spellchecker.keys()`"""
        yield from self._dictionary.keys()

    def items(self) -> typing.Generator[typing.Tuple[str, int], None, None]:
        """Iterator over the words and their frequencies in the dictionary

        Yields:
            str: The next word in the dictionary
            int: The number of instances in the dictionary
        Note:
            This is the same as `dict.items()`"""
        yield from self._dictionary.items()

    def load_dictionary(self, filename: str, encoding: str = "utf-8") -> None:
        """Load in a pre-built word frequency list

        Args:
            filename (str): The filepath to the json (optionally gzipped) file to be loaded
            encoding (str): The encoding of the dictionary"""
        with load_file(filename, encoding) as data:
            data = data if self._case_sensitive else data.lower()
            self._dictionary.update(json.loads(data))
            self._update_dictionary()

    def load_json(self, data: typing.Dict[str, int]) -> None:
        """Load in a pre-built word frequency list

        Args:
            data (dict): The dictionary to be loaded"""
        self._dictionary.update(data)
        self._update_dictionary()

    def load_text_file(
        self,
        filename: str,
        encoding: str = "utf-8",
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load in a text file from which to generate a word frequency list

        Args:
            filename (str): The filepath to the text file to be loaded
            encoding (str): The encoding of the text file
            tokenizer (function): The function to use to tokenize a string
        """
        with load_file(filename, encoding=encoding) as data:
            self.load_text(data, tokenizer)
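
    # A sketch of building a checker from a custom corpus (hypothetical path):
    #
    #     spell = SpellChecker(language=None)
    #     spell.word_frequency.load_text_file("./my_corpus.txt")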

    def load_text(
        self,
        text: KeyT,
        tokenizer: typing.Optional[typing.Callable[[str], typing.Iterable[str]]] = None,
    ) -> None:
        """Load text from which to generate a word frequency list

        Args:
            text (str): The text to be loaded
            tokenizer (function): The function to use to tokenize a string
        """
        text = ensure_unicode(text)
        if tokenizer:
            words = [x if self._case_sensitive else x.lower() for x in tokenizer(text)]
        else:
            words = self.tokenize(text)  # type: ignore

        self._dictionary.update(words)
        self._update_dictionary()

    def load_words(self, words: typing.Iterable[KeyT]) -> None:
        """Load a list of words from which to generate a word frequency list

        Args:
            words (list): The list of words to be loaded"""
        words = [ensure_unicode(w) for w in words]
        self._dictionary.update([word if self._case_sensitive else word.lower() for word in words])
        self._update_dictionary()

    def add(self, word: KeyT, val: int = 1) -> None:
        """Add a word to the word frequency list

        Args:
            word (str): The word to add
            val (int): The number of times to insert the word"""
        word = ensure_unicode(word)
        self.load_json({word if self._case_sensitive else word.lower(): val})
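
    # E.g., wf.add("colour", 500) inserts "colour" with a count of 500; larger
    # counts make a word more likely to be chosen by SpellChecker.correction().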

    def remove_words(self, words: typing.Iterable[KeyT]) -> None:
        """Remove a list of words from the word frequency list

        Args:
            words (list): The list of words to remove"""
        words = [ensure_unicode(w) for w in words]
        for word in words:
            self.pop(word)
        self._update_dictionary()

    def remove(self, word: KeyT) -> None:
        """Remove a word from the word frequency list

        Args:
            word (str): The word to remove"""
        self.pop(word)
        self._update_dictionary()

    def remove_by_threshold(self, threshold: int = 5) -> None:
        """Remove all words at, or below, the provided threshold

        Args:
            threshold (int): The threshold at which a word is to be removed"""
        to_remove = [k for k, v in self._dictionary.items() if v <= threshold]
        self.remove_words(to_remove)
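
    # E.g., wf.remove_by_threshold(5) drops every word seen five times or
    # fewer, a common way to prune noisy one-off tokens after loading a raw
    # corpus.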

    def _update_dictionary(self) -> None:
        """Update the word frequency object"""
        self._longest_word_length = 0
        self._total_words = sum(self._dictionary.values())
        self._unique_words = len(self._dictionary.keys())
        self._letters = set()
        for key in self._dictionary:
            if len(key) > self._longest_word_length:
                self._longest_word_length = len(key)
            self._letters.update(key)
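

if __name__ == "__main__":  # pragma: no cover
    # Minimal self-check of WordFrequency (illustrative only). Because this
    # module uses relative imports, run it via the package, e.g.
    # `python -m spellchecker.spellchecker` (module path assumed).
    wf = WordFrequency()
    wf.load_words(["apple", "apple", "banana"])
    assert wf.total_words == 3 and wf.unique_words == 2
    assert wf["apple"] == 2 and "banana" in wf
    print("WordFrequency self-check passed")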