Coverage for src/debputy/lsp/spellchecking.py: 83%

175 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2025-01-27 13:59 +0000

1import functools 

2import importlib.resources 

3import itertools 

4import os 

5import re 

6import subprocess 

7from importlib.resources.abc import Traversable 

8from typing import Iterable, FrozenSet, Tuple, Optional, Container, TYPE_CHECKING 

9 

10from debian.debian_support import Release 

11from debputy.lsp.quickfixes import propose_correct_text_quick_fix 

12from debputy.util import _info, _warn 

13import debputy.lsp.data.wordlists as data_wordlist 

14 

15try: 

16 from debputy.lsp.vendoring._deb822_repro.locatable import ( 

17 Position as TEPosition, 

18 Range as TERange, 

19 ) 

20except ImportError: 

21 pass 

22 

23if TYPE_CHECKING: 

24 from debputy.linting.lint_util import LintState 

25 

26 

27_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic" 

28_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff" 

29_WORD_PARTS = re.compile(r"(\S+)") 

30_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)") 

31_FIND_QUOTE_CHAR = re.compile(r'["`]') 

32_LOOKS_LIKE_FILENAME = re.compile( 

33 r""" 

34 [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/* 

35 | [a-z0-9-_]+(/[a-z0-9]+)+/* 

36 | [a-z0-9_]+(/[a-z0-9_]+){2,}/* 

37 | (?:\S+)?[.][a-z]{1,3} 

38 

39""", 

40 re.VERBOSE, 

41) 

42_LOOKS_LIKE_PROGRAMMING_TERM = re.compile( 

43 r""" 

44 ( 

45 # Java identifier Camel Case 

46 [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+ 

47 # Type name Camel Case 

48 | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+ 

49 # Type name Camel Case with underscore (seen in Dh_Lib.pm among other 

50 | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+ 

51 # Perl module 

52 | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+ 

53 # Probably an abbreviation 

54 | [A-Z]{3,} 

55 # Perl/Python identifiers or Jinja templates 

56 | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)? 

57 # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO) 

58 | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)? 

59 | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\# 

60 # Subcommand names. Require at least two "-" to avoid skipping hyphenated words 

61 | [a-z][a-z0-9]*(-[a-z0-9]+){2,} 

62 # Short args 

63 | -[a-z0-9]+ 

64 # Things like 32bit 

65 | \d{2,}-?[a-z]+ 

66 # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words) 

67 | src:[a-z0-9][-+.a-z0-9]+ 

68 | [a-z0-9][-+.a-z0-9]+:(?:any|native) 

69 # Version 

70 | v\d+(?:[.]\S+)? 

71 # chmod symbolic mode or math 

72 | \S*=\S+ 

73 ) 

74""", 

75 re.VERBOSE, 

76) 

77_LOOKS_LIKE_EMAIL = re.compile( 

78 r""" 

79 <[^>@\s]+@[^>@\s]+> 

80""", 

81 re.VERBOSE, 

82) 

83_NO_CORRECTIONS = tuple() 

84_WORDLISTS = [ 

85 "debian-wordlist.dic", 

86] 

87_NAMELISTS = [ 

88 "logins-and-people.dic", 

89] 

90_PERSONAL_DICTS = [ 

91 "${HOME}/.hunspell_default", 

92 "${HOME}/.hunspell_en_US", 

93] 

94 

95 

# Probe for hunspell support: both the en_US dictionary and affix file must be
# present and the python binding must be importable. A missing data file is
# reported through the same ImportError path as a missing binding.
try:
    _required_files = (_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
    if not all(os.path.lexists(p) for p in _required_files):
        raise ImportError
    from hunspell import HunSpell
except ImportError:
    _HAS_HUNSPELL = False
else:
    _HAS_HUNSPELL = True

106 

107 

108def _read_wordlist( 

109 base_dir: Traversable, 

110 wordlist_name: str, 

111 *, 

112 namelist: bool = False, 

113) -> Iterable[str]: 

114 path = base_dir.joinpath(wordlist_name) 

115 with path.open("r", encoding="utf-8") as fd: 

116 w = [w.strip() for w in fd] 

117 yield from w 

118 if namelist: 

119 yield from (f"{n}'s" for n in w) 

120 

121 

def _all_debian_archs() -> Iterable[str]:
    """Return the lines printed by ``dpkg-architecture -L``, stripped.

    If the command cannot be found or exits with an error, a warning is
    emitted and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return tuple()

    listed = raw_output.decode("utf-8").splitlines()
    return (entry.strip() for entry in listed)

130 

131 

@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Collect the built-in words that should always be accepted.

    The result combines the entries of every file in ``_WORDLISTS``, the
    entries of every file in ``_NAMELISTS`` (read with ``namelist=True``),
    whatever iterating ``Release.releases`` produces, and the output of
    ``_all_debian_archs()``. The result is cached after the first call.
    """
    # Keep the original evaluation order of the external lookups: the release
    # iterator, the data directory and the architecture list are all obtained
    # before any wordlist file is read.
    release_names = iter(Release.releases)
    data_dir = importlib.resources.files(data_wordlist.__name__)
    arch_names = _all_debian_archs()

    collected = set()
    for wordlist in _WORDLISTS:
        collected.update(_read_wordlist(data_dir, wordlist))
    for names_file in _NAMELISTS:
        collected.update(_read_wordlist(data_dir, names_file, namelist=True))
    collected.update(release_names)
    collected.update(arch_names)
    return frozenset(collected)

148 

149 

150_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None 

151 

152 

def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic for each word with corrections.

    :param lint_state: Supplies the spellchecker and receives the diagnostics.
    :param line_no: Line number used for both ends of each diagnostic range.
    :param line: The text to check.
    :return: The number of words for which a diagnostic was emitted.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            typo_count += 1
            lint_state.emit_diagnostic(
                TERange(
                    TEPosition(line_no, start),
                    TEPosition(line_no, end),
                ),
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count

178 

179 

def default_spellchecker() -> "Spellchecker":
    """Return the module-wide default spellchecker, creating it on first use.

    When no default has been set yet, a ``HunspellSpellchecker`` is created
    if hunspell support was detected at import time; otherwise the shared
    do-nothing spellchecker is used. The choice is stored for later calls.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = (
            HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
        )
    return _DEFAULT_SPELL_CHECKER

190 

191 

@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return a spellchecker that never reports anything.

    The result is cached, so repeated calls return the same instance.
    """
    no_op_checker = EverythingIsCorrectSpellchecker()
    return no_op_checker

195 

196 

def disable_spellchecking() -> None:
    """Replace the module-wide default spellchecker with the do-nothing one."""
    global _DEFAULT_SPELL_CHECKER
    no_op_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = no_op_checker

200 

201 

def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Install ``spellchecker`` as the module-wide default (test helper).

    :param spellchecker: The new default; ``None`` clears it so that the next
      ``default_spellchecker()`` call creates one again.
    :return: The default that was in place before the call (possibly ``None``).
    """
    global _DEFAULT_SPELL_CHECKER
    # Swap in one step: the right-hand side is evaluated before assignment.
    replaced, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return replaced

209 

210 

def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of ``line`` that are outside quoted sections.

    A quoted section starts at a character matched by ``_FIND_QUOTE_CHAR``
    and ends at the next occurrence of that same character. The quoted
    text, including both quote characters, is not yielded.

    Text in front of a quoted section is skipped when it is empty or
    consists only of whitespace. The remainder of the line (after the last
    quoted section, or the whole line when there is none) is always yielded
    as-is. If an opening quote has no matching closing quote, everything
    from the current position onwards - including that quote character -
    is yielded as regular text.

    :param line: The line to split.
    :return: Tuples of ``(text, offset)`` where ``offset`` is the index in
      ``line`` at which ``text`` starts.
    """
    current_pos = 0
    while True:
        m = _FIND_QUOTE_CHAR.search(line, current_pos)
        if m is None:
            # No (more) quotes; line[0:] is the whole line, so no special
            # case is needed for current_pos == 0.
            yield line[current_pos:], current_pos
            return

        starting_marker_pos = m.start()
        # str.find reports a missing closing quote as -1, which keeps the
        # "unterminated quote" case out of exception handling.
        end_marker_pos = line.find(m.group(), starting_marker_pos + 1)
        if end_marker_pos < 0:
            yield line[current_pos:], current_pos
            return

        part = line[current_pos:starting_marker_pos]
        # "".isspace() is False, so test the stripped text instead to drop
        # both empty and whitespace-only parts.
        if part.strip():
            yield part, current_pos
        current_pos = end_marker_pos + 1

234 

235 

def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield the spellcheckable words of ``line`` with their positions.

    Quoted sections are skipped first. Each remaining whitespace-separated
    token is dropped entirely when it starts with ``--`` or looks like a
    programming term, a file name or an email address. The word-like pieces
    of the surviving tokens are yielded.

    :param line: The line to split.
    :return: Tuples of ``(word, start, end)`` with offsets relative to
      ``line``.
    """
    for segment, segment_start in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(segment):
            text = token.group(1)
            not_prose = (
                text.startswith("--")  # CLI arg
                or _LOOKS_LIKE_PROGRAMMING_TERM.match(text)
                or _LOOKS_LIKE_FILENAME.match(text)
                or _LOOKS_LIKE_EMAIL.match(text)
            )
            if not_prose:
                continue
            # Offset of this token within the full line.
            base = segment_start + token.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(text):
                yield piece.group(1), piece.start(1) + base, piece.end(1) + base

254 

255 

class Spellchecker:
    """Base class for spellcheckers.

    Subclasses provide ``provide_corrections_for``; word splitting and the
    context-specific ignore wrapper are supplied here.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield ``(word, start, end)`` for each spellcheckable word in ``line``."""
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for ``word``; empty when it is fine.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a spellchecker that additionally accepts ``words``.

        When ``words`` is falsy, this instance is returned unchanged.
        """
        if words:
            return ContextIgnoredWordsSpellchecker(self, words)
        return self

272 

273 

class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spellchecker and accept an extra set of words.

    A word is accepted without consulting the wrapped spellchecker when its
    lower-cased form is contained in the ignored words.
    """

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        """Store the wrapped spellchecker and the words to ignore."""
        self._spellchecker = spellchecker
        self._context_ignored_words = context_ignored_words

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        wrapped = self._spellchecker
        return wrapped.iter_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; otherwise delegate."""
        if word.lower() not in self._context_ignored_words:
            return self._spellchecker.provide_corrections_for(word)
        return _NO_CORRECTIONS

289 

290 

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never proposes a correction."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result for every word."""
        return _NO_CORRECTIONS

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this instance; ignoring extra words changes nothing here."""
        return self

297 

298 

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell with the en_US dictionary.

    On construction, the built-in exception words and any personal
    dictionaries that exist on disk are added to the hunspell instance.
    """

    # Words with these prefixes are never reported.
    _IGNORED_PREFIXES = (
        "dpkg-",
        "dh-",
        "dh_",
        "debian-",
        "debconf-",
        "update-",
        "DEB_",
        "DPKG_",
    )
    # 'ing is deliberately forcing a word into another word-class
    _IGNORED_SUFFIXES = ("'ing", "-nss")

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        # The lookup cache is created per instance. Decorating the method
        # itself with lru_cache would key every entry on ``self`` and keep
        # each instance alive for as long as the class-level cache exists.
        self._lookup = functools.lru_cache(128)(self._lookup_uncached)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for ``word``; empty when accepted.

        Words starting with one of the ignored prefixes or ending with one
        of the ignored suffixes are accepted without asking hunspell.
        """
        if word.startswith(self._IGNORED_PREFIXES):
            return _NO_CORRECTIONS
        if word.endswith(self._IGNORED_SUFFIXES):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup_uncached(self, word: str) -> Iterable[str]:
        """Ask hunspell about ``word`` (wrapped by the per-instance cache).

        Returns no corrections when ``spell`` reports a truthy result, and
        whatever ``suggest`` returns otherwise.
        """
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        return self._checker.suggest(word)

    def _load_personal_exclusions(self) -> None:
        """Add each existing file from ``_PERSONAL_DICTS`` to hunspell.

        A leading ``${VAR}`` in an entry is replaced by the value of that
        environment variable; the entry is skipped when the variable is
        not set. Entries that do not name an existing file are ignored.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)