Coverage for src/debputy/lsp/spellchecking.py: 83%
175 statements
« prev ^ index » next coverage.py v7.8.2, created at 2026-06-16 19:34 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2026-06-16 19:34 +0000
1import functools
2import importlib.resources
3import itertools
4import os
5import re
6import subprocess
7from importlib.resources.abc import Traversable
8from typing import Optional, TYPE_CHECKING
9from collections.abc import Iterable, Container
11from debian.debian_support import Release
12from debputy.lsp.quickfixes import propose_correct_text_quick_fix
13from debputy.util import _info, _warn
14import debputy.lsp.data.wordlists as data_wordlist
16try:
17 from debian._deb822_repro.locatable import (
18 Position as TEPosition,
19 Range as TERange,
20 )
21except ImportError:
22 pass
24if TYPE_CHECKING:
25 from debputy.linting.lint_util import LintState
# Hunspell dictionary and affix files.  Hunspell-backed checking is only enabled
# when both paths exist (see the availability probe further down in this module).
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# One whitespace-delimited token of a line.
_WORD_PARTS = re.compile(r"(\S+)")
# A run of word characters, optionally followed by a single "-suffix" or
# "'suffix" part; used to strip surrounding punctuation from a token.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# The characters treated as quote markers: double quote and backtick.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Tokens shaped like paths or file names.  The pattern is compiled in verbose
# mode, so the whitespace and line breaks inside it are not significant.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}
    | \S+(?:_\S+)+
""",
    re.VERBOSE,
)
# Tokens shaped like identifiers, command-line fragments, versions and similar
# technical terms.  `_split_line_to_words` applies it with `.match`, i.e. anchored
# at the start of the token only.  Compiled in verbose mode: whitespace inside the
# pattern is insignificant and the `#` lines are regex comments.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
   (
       # Java identifier Camel Case
          [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
       # Type name Camel Case
        | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
       # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
       # Perl module
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
       # Probably an abbreviation
        | [A-Z]{3,}
       # Perl/Python identifiers or Jinja templates
        | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
       # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
        | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
        | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
       # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
        | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
       # Short args
        | -[a-z0-9]+
       # Things like 32bit
        | \d{2,}-?[a-z]+
       # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
        | src:[a-z0-9][-+.a-z0-9]+
        | [a-z0-9][-+.a-z0-9]+:(?:any|native)
       # Variables
        | [$]\S+
       # Version
        | v\d+(?:[.]\S+)?
       # chmod symbolic mode or math
        | \S*=\S+
   )
""",
    re.VERBOSE,
)
# An email address wrapped in angle brackets, e.g. "<user@example.org>".
# Compiled in verbose mode, so whitespace inside the pattern is insignificant.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
86_NO_CORRECTIONS = tuple[str]()
87_WORDLISTS = [
88 "debian-wordlist.dic",
89]
90_NAMELISTS = [
91 "logins-and-people.dic",
92]
93_PERSONAL_DICTS = [
94 "${HOME}/.hunspell_default",
95 "${HOME}/.hunspell_en_US",
96]
# Probe for a usable hunspell setup: both data files must be present and the
# Python binding must be importable.  Either failure disables hunspell support.
try:
    if not (
        os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF)
    ):
        # Reuse the ImportError path so a missing dictionary is handled exactly
        # like a missing binding.
        raise ImportError
    from hunspell import HunSpell
except ImportError:
    _HAS_HUNSPELL = False
else:
    _HAS_HUNSPELL = True
111def _read_wordlist(
112 base_dir: Traversable,
113 wordlist_name: str,
114 *,
115 namelist: bool = False,
116) -> Iterable[str]:
117 path = base_dir.joinpath(wordlist_name)
118 with path.open("r", encoding="utf-8") as fd:
119 w = [w.strip() for w in fd]
120 yield from w
121 if namelist:
122 yield from (f"{n}'s" for n in w)
def _all_debian_archs() -> Iterable[str]:
    """Return the names printed by ``dpkg-architecture -L``, one per output line.

    If the command is not installed or exits with an error, a warning is
    emitted and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    lines = raw_output.decode("utf-8").splitlines()
    return map(str.strip, lines)
@functools.lru_cache
def _builtin_exception_words() -> frozenset[str]:
    """Collect the words that are always accepted, regardless of the dictionary.

    The set combines the bundled word lists, the bundled name lists (including
    possessive forms), the entries of ``Release.releases`` and the output of
    `_all_debian_archs`.  The result is computed once and cached.
    """
    data_dir = importlib.resources.files(data_wordlist.__name__)
    # Queried before any word list is read, matching the original evaluation order.
    arch_names = _all_debian_archs()

    accepted: set[str] = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(data_dir, wordlist))
    for namelist in _NAMELISTS:
        accepted.update(_read_wordlist(data_dir, namelist, namelist=True))
    accepted.update(Release.releases)
    accepted.update(arch_names)
    return frozenset(accepted)
# Process-wide default spellchecker.  Stays None until `default_spellchecker`
# creates one on first use; `disable_spellchecking` and
# `_testing_set_default_spellchecker` overwrite it.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None
def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic for every word with corrections.

    :param lint_state: Supplies the spellchecker and receives the diagnostics.
    :param line_no: Line number used for both ends of each reported range.
    :param line: The text to check.
    :return: The number of diagnostics emitted for this line.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            flagged_range = TERange(
                TEPosition(line_no, start),
                TEPosition(line_no, end),
            )
            typo_count += 1
            lint_state.emit_diagnostic(
                flagged_range,
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count
def default_spellchecker() -> "Spellchecker":
    """Return the process-wide default spellchecker, creating it on first use.

    A hunspell-backed checker is created when hunspell is available; otherwise
    the shared do-nothing checker is used.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = (
            HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
        )
    return _DEFAULT_SPELL_CHECKER
@functools.lru_cache
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared spellchecker that never reports anything (cached)."""
    noop_checker = EverythingIsCorrectSpellchecker()
    return noop_checker
def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the process-wide default."""
    global _DEFAULT_SPELL_CHECKER
    noop_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = noop_checker
def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Replace the process-wide default spellchecker and return the old one.

    :param spellchecker: The new default; ``None`` resets it so the next call
        to `default_spellchecker` creates a fresh one.
    :return: The default that was in place before the call (possibly ``None``).
    """
    global _DEFAULT_SPELL_CHECKER
    previous, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return previous
213def _skip_quoted_parts(line: str) -> Iterable[tuple[str, int]]:
214 current_pos = 0
215 while True:
216 try:
217 m = _FIND_QUOTE_CHAR.search(line, current_pos)
218 if m is None:
219 if current_pos == 0:
220 yield line, 0
221 else:
222 yield line[current_pos:], current_pos
223 return
224 starting_marker_pos = m.span()[0]
225 quote_char = m.group()
226 end_marker_pos = line.index(quote_char, starting_marker_pos + 1)
227 except ValueError:
228 yield line[current_pos:], current_pos
229 return
231 part = line[current_pos:starting_marker_pos]
233 if not part.isspace(): 233 ↛ 235line 233 didn't jump to line 235 because the condition on line 233 was always true
234 yield part, current_pos
235 current_pos = end_marker_pos + 1
def _split_line_to_words(line: str) -> Iterable[tuple[str, int, int]]:
    """Yield the spellcheckable words of ``line`` with their positions.

    Quoted regions are skipped.  Whitespace-delimited tokens that start with
    ``--`` or that look like a programming term, a file name or an email
    address are skipped as a whole.  The remaining tokens are reduced to their
    word-like pieces.

    :param line: The text to split.
    :return: ``(word, start, end)`` triples; ``start``/``end`` are offsets into
        ``line``.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(segment):
            text = token.group(1)
            # "--" marks a command-line option.
            if text.startswith("--") or any(p.match(text) for p in skip_patterns):
                continue
            base = segment_start + token.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(text):
                yield piece.group(1), base + piece.start(1), base + piece.end(1)
class Spellchecker:
    """Base interface for spellcheckers.

    Subclasses implement `provide_corrections_for`; word splitting and the
    per-context ignore list are provided here.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Yield ``(word, start, end)`` for each word of ``line`` to be checked."""
        for entry in _split_line_to_words(line):
            yield entry

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for ``word``; empty means nothing to report."""
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spellchecker that additionally accepts the given words.

        When ``words`` is empty, this instance is returned unchanged.
        """
        return ContextIgnoredWordsSpellchecker(self, words) if words else self
class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spellchecker and accept an extra set of words.

    Words are lower-cased before being looked up in the ignore collection.
    """

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        self._spellchecker = spellchecker
        self._context_ignored_words = context_ignored_words

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        wrapped = self._spellchecker
        return wrapped.iter_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; otherwise ask the wrapped checker."""
        is_ignored = word.lower() in self._context_ignored_words
        if not is_ignored:
            return self._spellchecker.provide_corrections_for(word)
        return _NO_CORRECTIONS
class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never reports a spelling problem."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result for every word."""
        no_suggestions = _NO_CORRECTIONS
        return no_suggestions

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this instance; ignoring more words cannot change the outcome."""
        unchanged = self
        return unchanged
class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell with the en_US dictionary.

    On construction, the built-in exception words and any personal hunspell
    dictionaries that exist are added to the checker.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        # Memoise lookups per instance.  Decorating the method itself with
        # ``lru_cache`` would key one class-level cache on ``self`` and thereby
        # keep instances (and their loaded dictionaries) alive while cached.
        self._cached_lookup = functools.lru_cache(maxsize=128)(self._lookup)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return hunspell's suggestions for ``word``.

        The result is empty when hunspell accepts the word, or when the word
        is skipped because of one of the prefixes/suffixes below.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        # NOTE(review): the reason for skipping "-nss" is not evident from this
        # module - confirm before changing.
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._cached_lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Ask hunspell about ``word`` (uncached).

        :return: No corrections if hunspell accepts the word, otherwise
            hunspell's suggestions.
        """
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        return self._checker.suggest(word)

    def _load_personal_exclusions(self) -> None:
        """Add every existing personal dictionary from ``_PERSONAL_DICTS``.

        An entry starting with ``${VAR}`` has that prefix replaced with the
        value of the environment variable ``VAR``; the entry is skipped when
        the variable is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)