Coverage for src/debputy/lsp/spellchecking.py: 82%

176 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-10-12 15:06 +0000

1import functools 

2import importlib.resources 

3import itertools 

4import os 

5import re 

6import subprocess 

7from importlib.resources.abc import Traversable 

8from typing import FrozenSet, Tuple, Optional, TYPE_CHECKING 

9from collections.abc import Iterable, Container 

10 

11from debian.debian_support import Release 

12from debputy.lsp.quickfixes import propose_correct_text_quick_fix 

13from debputy.util import _info, _warn 

14import debputy.lsp.data.wordlists as data_wordlist 

15 

16try: 

17 from debputy.lsp.vendoring._deb822_repro.locatable import ( 

18 Position as TEPosition, 

19 Range as TERange, 

20 ) 

21except ImportError: 

22 pass 

23 

24if TYPE_CHECKING: 

25 from debputy.linting.lint_util import LintState 

26 

27 

# Location of the hunspell en_US dictionary and its affix file.
_SPELL_CHECKER_DICT: str = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF: str = "/usr/share/hunspell/en_US.aff"

# A "word part" is any run of non-whitespace characters.
_WORD_PARTS: re.Pattern[str] = re.compile(r"(\S+)")
# Extracts the word-like pieces of a word part: word characters optionally
# followed by one "-suffix" or one "'suffix"; surrounding symbols are dropped.
_PRUNE_SYMBOLS_RE: re.Pattern[str] = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that open (and close) a quoted span: double quote and backtick.
_FIND_QUOTE_CHAR: re.Pattern[str] = re.compile(r'["`]')

# Matches (from the start of a word part) text that looks like a path or a
# file name; such word parts are not spellchecked.
_LOOKS_LIKE_FILENAME: re.Pattern[str] = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    flags=re.VERBOSE,
)

# Matches (from the start of a word part) text that looks like an identifier,
# option, package reference or similar technical term; such word parts are not
# spellchecked. Each alternative is described inside the verbose pattern.
_LOOKS_LIKE_PROGRAMMING_TERM: re.Pattern[str] = re.compile(
    r"""
   (
     # Java identifier Camel Case
       [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case
     | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
     | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
     # Perl module
     | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
     # Probably an abbreviation
     | [A-Z]{3,}
     # Perl/Python identifiers or Jinja templates
     | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
     # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
     | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
     | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
     # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
     | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
     # Short args
     | -[a-z0-9]+
     # Things like 32bit
     | \d{2,}-?[a-z]+
     # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
     | src:[a-z0-9][-+.a-z0-9]+
     | [a-z0-9][-+.a-z0-9]+:(?:any|native)
     # Version
     | v\d+(?:[.]\S+)?
     # chmod symbolic mode or math
     | \S*=\S+
   )
""",
    flags=re.VERBOSE,
)

# Matches an email address wrapped in angle brackets, e.g. "<user@host>".
_LOOKS_LIKE_EMAIL: re.Pattern[str] = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    flags=re.VERBOSE,
)

# Shared empty result used whenever a word needs no corrections.
_NO_CORRECTIONS: tuple[str, ...] = ()

# Bundled word lists; each entry is a file name passed to `_read_wordlist`.
_WORDLISTS: list[str] = ["debian-wordlist.dic"]

# Bundled name lists; these are read with `namelist=True`.
_NAMELISTS: list[str] = ["logins-and-people.dic"]

# Candidate personal dictionaries. A leading "${VAR}" is substituted with the
# value of the environment variable VAR when the dictionaries are loaded.
_PERSONAL_DICTS: list[str] = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]

95 

96 

# Probe for hunspell support: both the en_US dictionary files and the Python
# binding must be present. A missing data file is reported the same way as a
# missing binding (ImportError), so one handler covers both cases.
try:
    if not (
        os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF)
    ):
        raise ImportError
    from hunspell import HunSpell
except ImportError:
    _HAS_HUNSPELL = False
else:
    _HAS_HUNSPELL = True

107 

108 

109def _read_wordlist( 

110 base_dir: Traversable, 

111 wordlist_name: str, 

112 *, 

113 namelist: bool = False, 

114) -> Iterable[str]: 

115 path = base_dir.joinpath(wordlist_name) 

116 with path.open("r", encoding="utf-8") as fd: 

117 w = [w.strip() for w in fd] 

118 yield from w 

119 if namelist: 

120 yield from (f"{n}'s" for n in w) 

121 

122 

def _all_debian_archs() -> Iterable[str]:
    """Return the names printed by `dpkg-architecture -L`, one per output line.

    If the command is not installed or exits with an error, a warning is
    emitted and an empty tuple is returned.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    listing = raw_output.decode("utf-8")
    return map(str.strip, listing.splitlines())

131 

132 

@functools.lru_cache
def _builtin_exception_words() -> frozenset[str]:
    """Collect the words that are always accepted in addition to the dictionary.

    The result combines the bundled word lists, the bundled name lists (with
    their possessive forms), the keys of `Release.releases` and the names
    returned by `_all_debian_archs`. It is computed once and then memoized.
    """
    release_names = iter(Release.releases)
    data_dir = importlib.resources.files(data_wordlist.__name__)
    arch_names = _all_debian_archs()

    words: set[str] = set()
    for wordlist in _WORDLISTS:
        words.update(_read_wordlist(data_dir, wordlist))
    for name_list in _NAMELISTS:
        words.update(_read_wordlist(data_dir, name_list, namelist=True))
    words.update(release_names)
    words.update(arch_names)
    return frozenset(words)

149 

150 

# Module-level default spellchecker; stays None until `default_spellchecker`
# creates one or another function in this module assigns it.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None

152 

153 

def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic for every word with corrections.

    :param lint_state: Provides the spellchecker and receives the diagnostics.
    :param line_no: Line number used for both ends of each diagnostic range.
    :param line: The text to check.
    :returns: The number of words for which a diagnostic was emitted.
    """
    checker = lint_state.spellchecker()
    reported = 0
    for word, start_col, end_col in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            reported += 1
            lint_state.emit_diagnostic(
                TERange(
                    TEPosition(line_no, start_col),
                    TEPosition(line_no, end_col),
                ),
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return reported

179 

180 

def default_spellchecker() -> "Spellchecker":
    """Return the module-level default spellchecker, creating it on first use.

    A hunspell-backed checker is created when hunspell support was detected;
    otherwise the do-nothing spellchecker is used.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = (
            HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
        )
    return _DEFAULT_SPELL_CHECKER

191 

192 

@functools.lru_cache
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared `EverythingIsCorrectSpellchecker` instance.

    The result is memoized, so every call returns the same object.
    """
    noop_checker = EverythingIsCorrectSpellchecker()
    return noop_checker

196 

197 

def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the module-level default."""
    global _DEFAULT_SPELL_CHECKER
    noop_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = noop_checker

201 

202 

def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Replace the module-level default spellchecker.

    :param spellchecker: The new default, or None to clear it.
    :returns: The default that was in place before the call.
    """
    global _DEFAULT_SPELL_CHECKER
    previous, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return previous

210 

211 

def _skip_quoted_parts(line: str) -> Iterable[tuple[str, int]]:
    """Yield the parts of `line` that lie outside quoted spans.

    A quoted span starts at a double quote or backtick and runs up to the next
    occurrence of the same character. An opening quote without a matching
    closing quote does not start a span.

    :param line: The text to split.
    :returns: Pairs of (text, offset of that text within `line`). Text in
      front of a quoted span is omitted when it is empty or consists only of
      whitespace. Whatever follows the last complete quoted span is always
      yielded as the final pair.
    """
    current_pos = 0
    while True:
        m = _FIND_QUOTE_CHAR.search(line, current_pos)
        if m is None:
            break
        starting_marker_pos = m.start()
        end_marker_pos = line.find(m.group(), starting_marker_pos + 1)
        if end_marker_pos < 0:
            # Unterminated quote: treat the rest of the line as plain text.
            break

        part = line[current_pos:starting_marker_pos]
        # `str.isspace()` is False for "", so test via strip() to also skip
        # the empty segment in front of a quote at the start of the line or
        # between two adjacent quoted spans.
        if part.strip():
            yield part, current_pos
        current_pos = end_marker_pos + 1

    yield line[current_pos:], current_pos

235 

236 

def _split_line_to_words(line: str) -> Iterable[tuple[str, int, int]]:
    """Yield the spellcheckable words of `line` with their positions.

    Quoted spans are skipped. Whitespace-separated word parts that start with
    "--" or that look like a programming term, a file name or an email address
    are skipped as a whole. The remaining word parts are reduced to their
    word-like pieces.

    :param line: The text to split.
    :returns: Triples of (word, start offset, end offset), with offsets
      relative to the start of `line`.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(segment):
            text = token.group(1)
            if text.startswith("--"):
                # CLI arg
                continue
            if any(pattern.match(text) for pattern in skip_patterns):
                continue
            # Offset of this word part within the full line.
            base = segment_start + token.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(text):
                yield piece.group(1), piece.start(1) + base, piece.end(1) + base

255 

256 

class Spellchecker:
    """Base class for spellcheckers.

    Subclasses must implement `provide_corrections_for`; word splitting and
    context-specific ignore lists are provided here.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new `EverythingIsCorrectSpellchecker`."""
        noop_checker = EverythingIsCorrectSpellchecker()
        return noop_checker

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Yield (word, start offset, end offset) for each word to check in `line`."""
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the proposed corrections for `word`.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spellchecker that additionally ignores `words`.

        When `words` is falsy, this spellchecker itself is returned.
        """
        if words:
            return ContextIgnoredWordsSpellchecker(self, words)
        return self

273 

274 

class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spellchecker and ignore an extra set of words.

    The lower-cased form of a word is looked up in the ignore set, so the set
    is expected to hold lower-case entries.
    """

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        self._context_ignored_words = context_ignored_words
        self._spellchecker = spellchecker

    def iter_words(self, line: str) -> Iterable[tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        words = self._spellchecker.iter_words(line)
        return words

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; otherwise ask the wrapped checker."""
        is_ignored = word.lower() in self._context_ignored_words
        if is_ignored:
            return _NO_CORRECTIONS
        return self._spellchecker.provide_corrections_for(word)

290 

291 

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never proposes a correction."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the empty corrections tuple for any `word`."""
        return _NO_CORRECTIONS

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this instance; no wrapper is created for the extra words."""
        return self

298 

299 

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell and the en_US dictionary.

    On construction the built-in exception words are added to the checker and
    any personal dictionaries that exist are loaded.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        # The lookup cache is created per instance. Decorating the method
        # itself with lru_cache would key the cache on `self` and keep every
        # instance alive for as long as the class-level cache exists.
        self._cached_lookup = functools.lru_cache(maxsize=128)(
            self._lookup_uncached
        )
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the proposed corrections for `word`; empty when none apply.

        Words with certain tool, package or variable prefixes and words with
        certain suffixes are never looked up.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Return the (memoized) hunspell result for `word`."""
        return self._cached_lookup(word)

    def _lookup_uncached(self, word: str) -> Iterable[str]:
        """Ask hunspell about `word` without consulting the cache."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # Stored as a tuple so that a caller cannot alter the cached result.
        return tuple(self._checker.suggest(word))

    def _load_personal_exclusions(self) -> None:
        """Load each personal dictionary from `_PERSONAL_DICTS` that exists.

        A leading `${VAR}` in an entry is replaced by the value of the
        environment variable VAR; the entry is skipped when VAR is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)