Coverage for src/debputy/lsp/spellchecking.py: 83%

175 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2026-06-16 19:34 +0000

1import functools 

2import importlib.resources 

3import itertools 

4import os 

5import re 

6import subprocess 

7from importlib.resources.abc import Traversable 

8from typing import Optional, TYPE_CHECKING 

9from collections.abc import Iterable, Container 

10 

11from debian.debian_support import Release 

12from debputy.lsp.quickfixes import propose_correct_text_quick_fix 

13from debputy.util import _info, _warn 

14import debputy.lsp.data.wordlists as data_wordlist 

15 

16try: 

17 from debian._deb822_repro.locatable import ( 

18 Position as TEPosition, 

19 Range as TERange, 

20 ) 

21except ImportError: 

22 pass 

23 

24if TYPE_CHECKING: 

25 from debputy.linting.lint_util import LintState 

26 

27 

# Paths of the hunspell en_US dictionary and affix files.  Both must exist
# for hunspell support to be enabled, and both are handed to the `HunSpell`
# constructor.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# A "word part" is a maximal run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")
# Extracts the spell-checkable cores of a token: a run of word characters,
# optionally followed by a single "-suffix" or "'suffix".
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that open (and close) a quoted span: double quote or backtick.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Verbose-mode pattern for tokens shaped like paths or file names
# (slash-separated lowercase segments, a dot followed by a 1-3 lowercase
# letter suffix, or underscore-joined text).  It is applied with `.match`,
# i.e. anchored at the start of the token only.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
    [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}
    | \S+(?:_\S+)+
""",
    re.VERBOSE,
)
# Verbose-mode pattern for tokens shaped like identifiers, options, variables
# and similar technical terms; each alternative is described by the comment
# preceding it inside the pattern.  It is applied with `.match`, i.e.
# anchored at the start of the token only.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
        # Java identifier Camel Case
          [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case
        | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
        # Perl module
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
        # Probably an abbreviation
        | [A-Z]{3,}
        # Perl/Python identifiers or Jinja templates
        | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
        # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
        | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
        | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
        # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
        | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
        # Short args
        | -[a-z0-9]+
        # Things like 32bit
        | \d{2,}-?[a-z]+
        # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
        | src:[a-z0-9][-+.a-z0-9]+
        | [a-z0-9][-+.a-z0-9]+:(?:any|native)
        # Variables
        | [$]\S+
        # Version
        | v\d+(?:[.]\S+)?
        # chmod symbolic mode or math
        | \S*=\S+
    )
""",
    re.VERBOSE,
)
# Verbose-mode pattern for an angle-bracketed `<local@domain>` token.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty tuple returned whenever a word needs no corrections.
_NO_CORRECTIONS = tuple[str]()
# Word list files read from the `debputy.lsp.data.wordlists` package.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Name list files from the same package; their entries are additionally
# accepted in possessive form ("<name>'s").
_NAMELISTS = [
    "logins-and-people.dic",
]
# Candidate personal dictionaries.  A leading "${VAR}" is replaced with the
# value of that environment variable when the dictionaries are loaded.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]

97 

98 

# Hunspell support is enabled only when both en_US data files are present
# AND the `hunspell` Python binding can be imported.  In every other case
# `_HAS_HUNSPELL` stays False and the name `HunSpell` is left undefined.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        pass
    else:
        _HAS_HUNSPELL = True

109 

110 

111def _read_wordlist( 

112 base_dir: Traversable, 

113 wordlist_name: str, 

114 *, 

115 namelist: bool = False, 

116) -> Iterable[str]: 

117 path = base_dir.joinpath(wordlist_name) 

118 with path.open("r", encoding="utf-8") as fd: 

119 w = [w.strip() for w in fd] 

120 yield from w 

121 if namelist: 

122 yield from (f"{n}'s" for n in w) 

123 

124 

def _all_debian_archs() -> Iterable[str]:
    """Return the lines printed by `dpkg-architecture -L`, stripped.

    If the command cannot be found or exits with a failure status, a warning
    is emitted and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    listing = raw_output.decode("utf-8")
    return (arch_line.strip() for arch_line in listing.splitlines())

133 

134 

@functools.lru_cache
def _builtin_exception_words() -> frozenset[str]:
    """Collect the words that are always accepted, computed once and cached.

    The result combines the bundled word lists, the bundled name lists (which
    also contribute possessive forms), the entries of `Release.releases` and
    the output of `_all_debian_archs()`.
    """
    data_dir = importlib.resources.files(data_wordlist.__name__)
    # Resolve these before the word list files are read, so the architecture
    # query still happens ahead of any file access.
    release_names = Release.releases
    arch_names = _all_debian_archs()

    accepted: set[str] = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(data_dir, wordlist))
    for names_file in _NAMELISTS:
        accepted.update(_read_wordlist(data_dir, names_file, namelist=True))
    accepted.update(release_names)
    accepted.update(arch_names)
    return frozenset(accepted)

150 

151 

# Process-wide default spellchecker.  It stays None until it is created by
# `default_spellchecker()`, or set by `disable_spellchecking()` or
# `_testing_set_default_spellchecker()`.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None

153 

154 

def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic for every word with corrections.

    :param lint_state: Provides the spellchecker and receives the diagnostics.
    :param line_no: Line number used for both ends of each diagnostic range.
    :param line: The text to check.
    :return: The number of words for which a diagnostic was emitted.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            flagged_range = TERange(
                TEPosition(line_no, start),
                TEPosition(line_no, end),
            )
            typo_count += 1
            lint_state.emit_diagnostic(
                flagged_range,
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count

180 

181 

def default_spellchecker() -> "Spellchecker":
    """Return the shared default spellchecker, creating it on first use.

    The first call creates a `HunspellSpellchecker` when hunspell support is
    available and otherwise falls back to the do-nothing spellchecker; the
    chosen instance is stored and returned by subsequent calls.
    """
    global _DEFAULT_SPELL_CHECKER
    existing = _DEFAULT_SPELL_CHECKER
    if existing is not None:
        return existing
    created = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = created
    return created

192 

193 

@functools.lru_cache
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the cached spellchecker instance that accepts every word."""
    accept_all = EverythingIsCorrectSpellchecker()
    return accept_all

197 

198 

def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the default spellchecker."""
    global _DEFAULT_SPELL_CHECKER
    replacement = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = replacement

202 

203 

def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Replace the default spellchecker and return the one it replaced.

    :param spellchecker: The new default; None resets it so that the next
      `default_spellchecker()` call creates a fresh one.
    :return: The previous default (possibly None).
    """
    global _DEFAULT_SPELL_CHECKER
    replaced, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return replaced

211 

212 

def _skip_quoted_parts(line: str) -> Iterable[tuple[str, int]]:
    """Yield the parts of `line` that are outside quoted spans.

    A quoted span starts at a quote character matched by `_FIND_QUOTE_CHAR`
    and ends at the next occurrence of that same character.  Each yielded
    item is `(text, offset)` where `offset` is the position of `text` within
    `line`.  When no (further) complete quoted span exists, the rest of the
    line is yielded as is - including any unterminated quote character.
    Parts consisting solely of whitespace between two spans are not yielded.
    """
    cursor = 0
    while True:
        opening = _FIND_QUOTE_CHAR.search(line, cursor)
        closing_at = -1
        if opening is not None:
            # The quote is a single character, so the search for its partner
            # starts right after it.
            closing_at = line.find(opening.group(), opening.start() + 1)
        if closing_at == -1:
            # Either no quote at all or an unterminated one.
            yield line[cursor:], cursor
            return

        unquoted = line[cursor : opening.start()]
        if not unquoted.isspace():
            yield unquoted, cursor
        cursor = closing_at + 1

236 

237 

def _split_line_to_words(line: str) -> Iterable[tuple[str, int, int]]:
    """Yield `(word, start, end)` for every spell-checkable word in `line`.

    Quoted spans are skipped first.  The remaining text is split on
    whitespace and a token is dropped when it starts with "--" or when its
    beginning matches the programming-term, file-name or email pattern.
    Surviving tokens are reduced to their word cores via `_PRUNE_SYMBOLS_RE`;
    `start` and `end` are offsets into the original `line`.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token_match in _WORD_PARTS.finditer(segment):
            token = token_match.group(1)
            if token.startswith("--"):
                # CLI arg
                continue
            if any(pattern.match(token) for pattern in skip_patterns):
                continue
            # Offset of the token within the full line.
            token_start = segment_start + token_match.start(1)
            for core in _PRUNE_SYMBOLS_RE.finditer(token):
                begin, end = core.span(1)
                yield core.group(1), begin + token_start, end + token_start

256 

257 

class Spellchecker:
    """Base class for spellcheckers; subclasses supply the corrections."""

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Yield `(word, start, end)` for each word of `line` to be checked."""
        for word_entry in _split_line_to_words(line):
            yield word_entry

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the suggested corrections for `word`; subclasses must override."""
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spellchecker that additionally ignores `words`.

        When `words` is falsy, this spellchecker itself is returned.
        """
        return ContextIgnoredWordsSpellchecker(self, words) if words else self

274 

275 

class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wraps another spellchecker and suppresses corrections for given words."""

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        """Store the wrapped spellchecker and the container of ignored words."""
        self._context_ignored_words = context_ignored_words
        self._spellchecker = spellchecker

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        delegate = self._spellchecker
        return delegate.iter_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections when the lowercased word is ignored.

        Any other word is passed on to the wrapped spellchecker.
        """
        is_ignored = word.lower() in self._context_ignored_words
        if not is_ignored:
            return self._spellchecker.provide_corrections_for(word)
        return _NO_CORRECTIONS

291 

292 

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never reports a correction for any word."""

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this instance; there is nothing to add ignored words to."""
        return self

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty corrections tuple regardless of `word`."""
        return _NO_CORRECTIONS

299 

300 

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell's en_US dictionary.

    On construction, the built-in exception words are added to the hunspell
    instance and any personal dictionaries that exist are loaded.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()
        # The lookup cache is created per instance.  Decorating the method
        # itself with `lru_cache` would create one class-wide cache keyed on
        # `self`, which keeps every instance (and its hunspell handle) alive
        # and makes all instances compete for the same 128 slots.
        self._cached_lookup = functools.lru_cache(maxsize=128)(self._lookup_uncached)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`, or nothing if it is accepted.

        Words with one of the well-known tool/variable prefixes below and
        words ending in "'ing" or "-nss" are accepted without consulting
        hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Return the (cached) hunspell verdict for `word`."""
        return self._cached_lookup(word)

    def _lookup_uncached(self, word: str) -> tuple[str, ...]:
        """Ask hunspell about `word`, bypassing the cache.

        :return: An empty tuple when hunspell accepts the word, otherwise its
          suggestions.  The suggestions are frozen into a tuple because the
          result is shared between all callers through the cache.
        """
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        return tuple(self._checker.suggest(word))

    def _load_personal_exclusions(self) -> None:
        """Load every personal dictionary from `_PERSONAL_DICTS` that exists.

        A leading "${VAR}" in an entry is replaced with the value of the
        environment variable VAR; the entry is skipped when VAR is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)