Coverage for src/debputy/lsp/spellchecking.py: 83%
175 statements
« prev ^ index » next coverage.py v7.6.0, created at 2025-01-27 13:59 +0000
« prev ^ index » next coverage.py v7.6.0, created at 2025-01-27 13:59 +0000
1import functools
2import importlib.resources
3import itertools
4import os
5import re
6import subprocess
7from importlib.resources.abc import Traversable
8from typing import Iterable, FrozenSet, Tuple, Optional, Container, TYPE_CHECKING
10from debian.debian_support import Release
11from debputy.lsp.quickfixes import propose_correct_text_quick_fix
12from debputy.util import _info, _warn
13import debputy.lsp.data.wordlists as data_wordlist
# The position/range types come from the vendored deb822 parser. A failed
# import is tolerated here; in that case TEPosition and TERange remain
# undefined in this module.
try:
    from debputy.lsp.vendoring._deb822_repro.locatable import (
        Position as TEPosition,
        Range as TERange,
    )
except ImportError:
    pass

# LintState is referenced only in (string) type annotations, so it is imported
# for type checkers only.
if TYPE_CHECKING:
    from debputy.linting.lint_util import LintState
# Paths of the hunspell en_US dictionary and affix files. Both must exist for
# the hunspell-backed checker to be enabled, and both are passed to HunSpell().
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# Matches each run of non-whitespace characters (a raw token of a line).
_WORD_PARTS = re.compile(r"(\S+)")
# Matches a run of word characters, optionally followed by a single "-suffix"
# or "'suffix"; used to extract the words to check from a raw token.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Matches a double quote or a backtick, the markers delimiting quoted text.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Tokens matching (from their start) this pattern are skipped as file names/paths.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
    [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}
""",
    re.VERBOSE,
)
# Tokens matching (from their start) this pattern are skipped as identifiers,
# options, versions and similar non-prose terms; see the inline comments.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
    # Java identifier Camel Case
    [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case
    | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
    # Perl module
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
    # Probably an abbreviation
    | [A-Z]{3,}
    # Perl/Python identifiers or Jinja templates
    | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
    # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
    | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
    | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
    # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
    | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
    # Short args
    | -[a-z0-9]+
    # Things like 32bit
    | \d{2,}-?[a-z]+
    # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
    | src:[a-z0-9][-+.a-z0-9]+
    | [a-z0-9][-+.a-z0-9]+:(?:any|native)
    # Version
    | v\d+(?:[.]\S+)?
    # chmod symbolic mode or math
    | \S*=\S+
    )
""",
    re.VERBOSE,
)
# Tokens starting with "<local@domain>" are skipped as email addresses.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result meaning "no corrections to offer".
_NO_CORRECTIONS = tuple()
# Word lists (resources of debputy.lsp.data.wordlists) whose entries are always accepted.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Name lists; each entry is accepted both as-is and with a possessive "'s" suffix.
_NAMELISTS = [
    "logins-and-people.dic",
]
# Personal dictionaries to load when present. A leading "${VAR}" is replaced
# with the value of that environment variable when the checker is created.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]
# Hunspell is usable only when both en_US data files are present AND the
# Python bindings can be imported; otherwise spellchecking falls back to a
# checker that accepts everything.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        pass
    else:
        _HAS_HUNSPELL = True
108def _read_wordlist(
109 base_dir: Traversable,
110 wordlist_name: str,
111 *,
112 namelist: bool = False,
113) -> Iterable[str]:
114 path = base_dir.joinpath(wordlist_name)
115 with path.open("r", encoding="utf-8") as fd:
116 w = [w.strip() for w in fd]
117 yield from w
118 if namelist:
119 yield from (f"{n}'s" for n in w)
def _all_debian_archs() -> Iterable[str]:
    """Return the architecture names printed by ``dpkg-architecture -L``.

    Each output line is stripped of surrounding whitespace. If the command is
    missing or exits with an error, a warning is emitted and an empty tuple is
    returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return tuple()

    arch_lines = raw_output.decode("utf-8").splitlines()
    return map(str.strip, arch_lines)
@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Return the set of words that are always accepted as correctly spelled.

    The set combines the bundled word lists, the bundled name lists (including
    their possessive forms), the entries of ``Release.releases`` and the
    architecture names reported by ``_all_debian_archs()``. The result is
    computed once and cached.
    """
    data_dir = importlib.resources.files(data_wordlist.__name__)
    accepted = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(data_dir, wordlist))
    for namelist in _NAMELISTS:
        accepted.update(_read_wordlist(data_dir, namelist, namelist=True))
    accepted.update(Release.releases)
    accepted.update(_all_debian_archs())
    return frozenset(accepted)
# Module-wide default spellchecker. It is None until default_spellchecker()
# creates one, or until it is replaced via disable_spellchecking() or
# _testing_set_default_spellchecker().
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None
def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic for every word with corrections.

    :param lint_state: Provides the spellchecker to use and receives the
      diagnostics.
    :param line_no: Line number used for the start and end position of each
      reported range.
    :param line: The text to check.
    :return: The number of diagnostics emitted for this line.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            flagged_range = TERange(
                TEPosition(line_no, start),
                TEPosition(line_no, end),
            )
            typo_count += 1
            # One quick fix per suggested replacement; never applied
            # automatically in non-interactive mode.
            lint_state.emit_diagnostic(
                flagged_range,
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[propose_correct_text_quick_fix(s) for s in suggestions],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count
def default_spellchecker() -> "Spellchecker":
    """Return the module-wide default spellchecker, creating it on first use.

    A hunspell-backed checker is created when hunspell is available; otherwise
    the shared do-nothing checker is used. The chosen checker is remembered
    for subsequent calls.
    """
    global _DEFAULT_SPELL_CHECKER
    current = _DEFAULT_SPELL_CHECKER
    if current is not None:
        return current
    current = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = current
    return current
# Cached so that every call returns the same EverythingIsCorrectSpellchecker.
@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared spellchecker that offers no corrections for any word."""
    return EverythingIsCorrectSpellchecker()
def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the module-wide default."""
    global _DEFAULT_SPELL_CHECKER
    _DEFAULT_SPELL_CHECKER = _do_nothing_spellchecker()
def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Install ``spellchecker`` as the module-wide default and return the old one.

    :param spellchecker: The new default; ``None`` clears the default so that
      the next ``default_spellchecker()`` call creates one again.
    :return: The previously installed default (possibly ``None``).
    """
    global _DEFAULT_SPELL_CHECKER
    replaced, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return replaced
def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of ``line`` that are outside quoted text.

    A quoted span starts at a double quote or backtick and ends at the next
    occurrence of the same character. Quoted spans (including their markers)
    are left out of the output.

    :param line: The text to split.
    :return: ``(part, offset)`` pairs where ``offset`` is the index of
      ``part`` within ``line``. Parts between quoted spans that are empty or
      consist only of whitespace are skipped. The remainder after the last
      complete quoted span is always yielded as-is; if a quote marker has no
      matching end marker, that remainder includes the unmatched marker.
    """
    current_pos = 0
    while True:
        m = _FIND_QUOTE_CHAR.search(line, current_pos)
        if m is None:
            # No further quote markers: the rest of the line is plain text.
            yield line[current_pos:], current_pos
            return
        starting_marker_pos = m.start()
        try:
            end_marker_pos = line.index(m.group(), starting_marker_pos + 1)
        except ValueError:
            # Unterminated quote: treat the rest of the line as plain text.
            yield line[current_pos:], current_pos
            return

        part = line[current_pos:starting_marker_pos]
        # "".isspace() is False, so the emptiness check must be explicit;
        # otherwise an empty part is yielded whenever a quoted span starts
        # exactly at current_pos.
        if part and not part.isspace():
            yield part, current_pos
        current_pos = end_marker_pos + 1
def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield the spellcheckable words of ``line`` with their positions.

    Quoted text is ignored. Whitespace-separated tokens that start with
    ``--`` or that look like a programming term, a file name or an email
    address are skipped entirely. The remaining tokens are reduced to their
    word-like pieces.

    :param line: The text to split.
    :return: ``(word, start, end)`` triples where ``start``/``end`` are the
      offsets of ``word`` within ``line``.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for line_part, part_pos in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(line_part):
            fullword = token.group(1)
            # "--" marks a command-line option; the patterns cover other
            # non-prose tokens.
            if fullword.startswith("--") or any(
                pattern.match(fullword) for pattern in skip_patterns
            ):
                continue
            # Offset of this token within the full line.
            token_offset = part_pos + token.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(fullword):
                yield (
                    piece.group(1),
                    piece.start(1) + token_offset,
                    piece.end(1) + token_offset,
                )
class Spellchecker:
    """Base interface for spellcheckers.

    Subclasses must implement ``provide_corrections_for``; word splitting and
    context-specific ignore lists are provided here.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that offers no corrections for any word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield ``(word, start, end)`` for every word of ``line`` to check."""
        for entry in _split_line_to_words(line):
            yield entry

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for ``word``; empty if none apply.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spellchecker that additionally ignores ``words``.

        When ``words`` is empty (falsy), this spellchecker itself is returned.
        """
        return ContextIgnoredWordsSpellchecker(self, words) if words else self
class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spellchecker and accept an extra set of ignored words."""

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        """Store the wrapped spellchecker and the words to ignore.

        :param spellchecker: The spellchecker that all other work is
          delegated to.
        :param context_ignored_words: Words to accept without consulting the
          wrapped spellchecker; looked up by the lower-cased word.
        """
        self._context_ignored_words = context_ignored_words
        self._spellchecker = spellchecker

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        words = self._spellchecker.iter_words(line)
        return words

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; otherwise delegate."""
        is_ignored = word.lower() in self._context_ignored_words
        if not is_ignored:
            return self._spellchecker.provide_corrections_for(word)
        return _NO_CORRECTIONS
class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never offers a correction for any word."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the empty corrections tuple regardless of ``word``."""
        return _NO_CORRECTIONS

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return ``self``; an ignore list cannot change the (empty) results."""
        return self
class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell with the system en_US dictionary.

    On creation, the built-in exception words and any personal dictionaries
    that exist are added to the hunspell instance. Lookup results are
    memoized per instance.
    """

    def __init__(self) -> None:
        """Create the hunspell checker and load all additional words."""
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()
        # The cache is created per instance rather than by decorating the
        # method: a decorated method would share one cache between all
        # instances (keyed on ``self``) and keep every instance alive for as
        # long as its entries remain in that cache.
        self._lookup = functools.lru_cache(maxsize=128)(self._lookup_uncached)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for ``word``; empty if it is accepted.

        Words with certain tool/variable prefixes or deliberate suffixes are
        accepted without consulting hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup_uncached(self, word: str) -> Iterable[str]:
        """Ask hunspell about ``word`` without consulting the cache.

        :return: The empty corrections tuple when hunspell accepts the word,
          otherwise hunspell's suggestions as a tuple. A tuple is returned so
          that callers cannot mutate a value that is stored in the cache.
        """
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        return tuple(self._checker.suggest(word))

    def _load_personal_exclusions(self) -> None:
        """Add every existing personal dictionary to the hunspell checker."""
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                # Replace the leading "${VAR}" with the value of VAR; skip the
                # entry when VAR is not set.
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)