200 lines
3.4 KiB
Python
200 lines
3.4 KiB
Python
|
"""French search language: includes the JS French stemmer."""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
from typing import TYPE_CHECKING, Dict
|
||
|
|
||
|
import snowballstemmer
|
||
|
|
||
|
from sphinx.search import SearchLanguage, parse_stop_word
|
||
|
|
||
|
# French stop words, passed through ``parse_stop_word``.
# The list follows the Snowball stop-list layout: the word(s) come first on
# each line and the text after ``|`` is a human-readable gloss.
# NOTE(review): presumably ``parse_stop_word`` discards everything after the
# ``|`` on each line -- confirm against ``sphinx.search`` before relying on it.
# The glosses for ``dans``, ``mes`` and ``ou`` were corrected here; the words
# themselves are unchanged from the upstream list.
french_stopwords = parse_stop_word('''
| source: http://snowball.tartarus.org/algorithms/french/stop.txt
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | in
de | of
des | de + les
du | de + le
elle | she
en | `of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | my (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | or
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you

| single letter forms

c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there

| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent

| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent

| Later additions (from Jean-Christophe Deschamps)
ceci | this
cela | that (added 11 Apr 2012. Omission reported by Adrien Grand)
celà | that (incorrect, though common)
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself
''')
|
||
|
|
||
|
|
||
|
class SearchFrench(SearchLanguage):
    """Search-language definition for French.

    Declares the language code and display name, the JavaScript stemmer
    file name, and the French stop-word set, and stems words with the
    Snowball ``french`` stemmer.
    """

    lang = 'fr'
    language_name = 'French'
    js_stemmer_rawcode = 'french-stemmer.js'
    stopwords = french_stopwords

    def init(self, options: dict) -> None:
        """Create the French Snowball stemmer and store it on the instance.

        ``options`` is accepted but not read by this method.
        """
        french_stemmer = snowballstemmer.stemmer('french')
        self.stemmer = french_stemmer

    def stem(self, word: str) -> str:
        """Return the stem of *word*, lower-casing it before stemming."""
        lowered = word.lower()
        return self.stemmer.stemWord(lowered)
|