Coverage for src/debputy/lsp/spellchecking.py: 82%
176 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-10-12 15:06 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-10-12 15:06 +0000
1import functools
2import importlib.resources
3import itertools
4import os
5import re
6import subprocess
7from importlib.resources.abc import Traversable
8from typing import FrozenSet, Tuple, Optional, TYPE_CHECKING
9from collections.abc import Iterable, Container
11from debian.debian_support import Release
12from debputy.lsp.quickfixes import propose_correct_text_quick_fix
13from debputy.util import _info, _warn
14import debputy.lsp.data.wordlists as data_wordlist
16try:
17 from debputy.lsp.vendoring._deb822_repro.locatable import (
18 Position as TEPosition,
19 Range as TERange,
20 )
21except ImportError:
22 pass
24if TYPE_CHECKING:
25 from debputy.linting.lint_util import LintState
# Paths of the hunspell en_US dictionary and its companion affix file.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# One maximal run of non-whitespace characters (a whitespace-delimited token).
_WORD_PARTS = re.compile(r"(\S+)")
# A run of word characters, optionally followed by exactly one "-xyz" or
# "'xyz" continuation; used to drop the symbols surrounding a token.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# A double quote or a backtick.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Tokens shaped like a path or a file name. The alternatives are, in order:
#  * up to three dots, a slash, then at least two lowercase/digit components
#  * a relative path with at least one "/component"
#  * a relative path (underscores allowed) with at least two "/component"s
#  * anything followed by a dot and a 1-3 letter lowercase extension
# The pattern is written in verbose mode, so whitespace inside it is ignored.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    re.VERBOSE,
)
# Tokens shaped like identifiers, options, versions and similar technical
# terms. Each alternative is described by the comment preceding it inside the
# pattern (verbose mode: whitespace and "#" comments in the pattern are
# ignored by the regex engine, which is why the literal "#" is escaped).
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
   (
    # Java identifier Camel Case
      [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case
    | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
    # Perl module
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
    # Probably an abbreviation
    | [A-Z]{3,}
    # Perl/Python identifiers or Jinja templates
    | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
    # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
    | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
    | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
    # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
    | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
    # Short args
    | -[a-z0-9]+
    # Things like 32bit
    | \d{2,}-?[a-z]+
    # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
    | src:[a-z0-9][-+.a-z0-9]+
    | [a-z0-9][-+.a-z0-9]+:(?:any|native)
    # Version
    | v\d+(?:[.]\S+)?
    # chmod symbolic mode or math
    | \S*=\S+
   )
""",
    re.VERBOSE,
)
# An angle-bracketed address: "<", a part without ">", "@" or whitespace, an
# "@", another such part, and ">".
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result returned whenever a word has nothing to report.
_NO_CORRECTIONS = tuple()
# Word list files read via `_read_wordlist` in plain mode.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Name list files; read with `namelist=True`, which also adds the "'s" forms.
_NAMELISTS = [
    "logins-and-people.dic",
]
# Candidate personal dictionaries. A leading "${VAR}" is replaced with the
# value of that environment variable by
# `HunspellSpellchecker._load_personal_exclusions`.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]
# hunspell is only usable when both the dictionary and the affix file exist
# AND the Python binding can be imported; otherwise spell checking is off.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        pass
    else:
        _HAS_HUNSPELL = True
109def _read_wordlist(
110 base_dir: Traversable,
111 wordlist_name: str,
112 *,
113 namelist: bool = False,
114) -> Iterable[str]:
115 path = base_dir.joinpath(wordlist_name)
116 with path.open("r", encoding="utf-8") as fd:
117 w = [w.strip() for w in fd]
118 yield from w
119 if namelist:
120 yield from (f"{n}'s" for n in w)
def _all_debian_archs() -> Iterable[str]:
    """Return the stripped output lines of `dpkg-architecture -L`.

    If the command is missing or exits with an error, a warning is emitted
    and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_listing = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    listing_lines = raw_listing.decode("utf-8").splitlines()
    return (entry.strip() for entry in listing_lines)
@functools.lru_cache
def _builtin_exception_words() -> frozenset[str]:
    """Collect the words that are always accepted, computed once and cached.

    The result combines the bundled word lists, the bundled name lists
    (including their "'s" forms), the entries obtained by iterating
    `Release.releases` and the output of `_all_debian_archs`.
    """
    release_names = list(Release.releases)
    data_dir = importlib.resources.files(data_wordlist.__name__)
    architectures = _all_debian_archs()

    words: set[str] = set()
    for wordlist in _WORDLISTS:
        words.update(_read_wordlist(data_dir, wordlist))
    for namelist in _NAMELISTS:
        words.update(_read_wordlist(data_dir, namelist, namelist=True))
    words.update(release_names)
    words.update(architectures)
    return frozenset(words)
# Process-wide spell checker instance; None until `default_spellchecker`
# creates one or another function in this module assigns it.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None
def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spell check one line and emit a diagnostic for every flagged word.

    :param lint_state: Provides the spell checker and receives diagnostics.
    :param line_no: Line number used for the positions of the diagnostics.
    :param line: The text to check.
    :returns: The number of words for which corrections were proposed.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start_col, end_col in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            flagged_range = TERange(
                TEPosition(line_no, start_col),
                TEPosition(line_no, end_col),
            )
            typo_count += 1
            lint_state.emit_diagnostic(
                flagged_range,
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count
def default_spellchecker() -> "Spellchecker":
    """Return the shared spell checker, creating it on first use.

    A hunspell-backed checker is created when hunspell is available;
    otherwise the checker that accepts every word is used.
    """
    global _DEFAULT_SPELL_CHECKER
    current = _DEFAULT_SPELL_CHECKER
    if current is not None:
        return current

    current = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = current
    return current
@functools.lru_cache
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the cached spell checker that never proposes corrections."""
    return EverythingIsCorrectSpellchecker()
def disable_spellchecking() -> None:
    """Make the default spell checker one that accepts every word."""
    global _DEFAULT_SPELL_CHECKER
    accept_everything = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = accept_everything
def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Replace the default spell checker and return the one it replaced.

    :param spellchecker: The new default; None clears the default.
    :returns: The previous default, which may be None.
    """
    global _DEFAULT_SPELL_CHECKER
    previous, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return previous
def _skip_quoted_parts(line: str) -> Iterable[tuple[str, int]]:
    """Yield the segments of `line` that lie outside quoted spans.

    A quoted span starts at a character matched by `_FIND_QUOTE_CHAR` and
    ends at the next occurrence of that same character; the span, including
    both markers, is left out. An opening marker without a matching closing
    marker is not treated as a quote: the remainder of the line is yielded
    unchanged.

    :param line: The text to split.
    :returns: Pairs of (segment, offset of the segment within `line`).
      Segments in front of a quoted span are only yielded when they contain
      something other than whitespace; the trailing segment is always yielded.
    """
    current_pos = 0
    while True:
        opening = _FIND_QUOTE_CHAR.search(line, current_pos)
        closing_pos = (
            -1 if opening is None else line.find(opening.group(), opening.end())
        )
        if closing_pos == -1:
            # No (complete) quoted span left; hand out the rest verbatim.
            yield line[current_pos:], current_pos
            return

        part = line[current_pos : opening.start()]
        # `str.isspace` is False for "", so test for real content instead;
        # otherwise a quote at the start of the line or two adjacent quoted
        # spans would produce an empty segment.
        if part.strip():
            yield part, current_pos
        current_pos = closing_pos + 1
def _split_line_to_words(line: str) -> Iterable[tuple[str, int, int]]:
    """Yield the words of `line` that should be spell checked.

    Quoted spans are skipped first. The remaining text is split into
    whitespace-delimited tokens; tokens that start with "--" or that match
    the programming-term, file-name or email patterns are dropped. Each
    surviving token is then reduced to its word-like pieces.

    :returns: Triples of (word, start offset, end offset); both offsets are
      relative to the start of `line`.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_offset in _skip_quoted_parts(line):
        for token_match in _WORD_PARTS.finditer(segment):
            token = token_match.group(1)
            # "--" prefix: CLI arg
            if token.startswith("--") or any(
                pattern.match(token) for pattern in skip_patterns
            ):
                continue
            token_offset = segment_offset + token_match.start(1)
            for word_match in _PRUNE_SYMBOLS_RE.finditer(token):
                word_start, word_end = word_match.span(1)
                yield (
                    word_match.group(1),
                    word_start + token_offset,
                    word_end + token_offset,
                )
class Spellchecker:
    """Base class for spell checkers.

    Subclasses must implement `provide_corrections_for`.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spell checker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Yield (word, start offset, end offset) for the words in `line`."""
        for word_entry in _split_line_to_words(line):
            yield word_entry

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return proposed corrections for `word`; empty if it is accepted."""
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spell checker that additionally accepts `words`.

        When `words` is empty, this instance itself is returned.
        """
        return ContextIgnoredWordsSpellchecker(self, words) if words else self
class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spell checker and accept an extra set of words.

    A word is accepted when its lowercased form is contained in the ignored
    words; every other word is delegated to the wrapped spell checker.
    """

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        self._context_ignored_words = context_ignored_words
        self._spellchecker = spellchecker

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Split `line` exactly as the wrapped spell checker does."""
        wrapped = self._spellchecker
        return wrapped.iter_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; delegate otherwise."""
        is_ignored = word.lower() in self._context_ignored_words
        if not is_ignored:
            return self._spellchecker.provide_corrections_for(word)
        return _NO_CORRECTIONS
class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spell checker that never proposes a correction."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result regardless of `word`."""
        return _NO_CORRECTIONS

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this instance; every word is already accepted."""
        return self
class HunspellSpellchecker(Spellchecker):
    """Spell checker backed by hunspell and its en_US dictionary.

    On construction, the built-in exception words are added to the hunspell
    instance and any existing personal dictionaries listed in
    `_PERSONAL_DICTS` are loaded.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()
        # The cache lives on the instance. Decorating the method itself
        # would create one class-level cache keyed on `self`, which keeps
        # every instance alive for as long as its entries stay cached.
        self._lookup = functools.lru_cache(maxsize=128)(self._uncached_lookup)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`.

        :param word: The word to check.
        :returns: An empty result when the word is accepted, otherwise the
          suggestions from hunspell (which can be empty as well).
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _uncached_lookup(self, word: str) -> Iterable[str]:
        """Ask hunspell about `word`; called through the `_lookup` cache."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # The result is cached and handed to several callers, so it is
        # stored as an immutable tuple rather than a shared list.
        return tuple(self._checker.suggest(word))

    def _load_personal_exclusions(self) -> None:
        """Load every personal dictionary from `_PERSONAL_DICTS` that exists.

        A leading "${VAR}" in an entry is replaced with the value of that
        environment variable; the entry is skipped when the variable is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)