Coverage for src/debputy/lsp/spellchecking.py: 83%

1import functools

2import importlib.resources

3import itertools

4import os

5import re

6import subprocess

7from importlib.resources.abc import Traversable

8from typing import Iterable, FrozenSet, Tuple, Optional, Container, TYPE_CHECKING

10from debian.debian_support import Release

11from debputy.lsp.quickfixes import propose_correct_text_quick_fix

12from debputy.util import _info, _warn

13import debputy.lsp.data.wordlists as data_wordlist

15try:

16 from debputy.lsp.vendoring._deb822_repro.locatable import (

17 Position as TEPosition,

18 Range as TERange,

19 )

20except ImportError:

21 pass

23if TYPE_CHECKING:

24 from debputy.linting.lint_util import LintState

# Dictionary and affix files handed to hunspell; both must exist for the
# hunspell backed checker to be enabled.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"

# A "word part" is any run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")
# Word characters optionally followed by ONE hyphenated or apostrophe suffix
# (e.g. "well-known", "it's"); surrounding punctuation is left out.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that open (and close) a quoted span: double quote or backtick.
_FIND_QUOTE_CHAR = re.compile(r'["`]')

# Tokens that look like paths or file names (applied with `.match`, i.e.
# anchored at the start of the token only).
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}
""",
    re.VERBOSE,
)

# Tokens that look like identifiers, options, versions and similar technical
# terms (applied with `.match`, i.e. anchored at the start of the token only).
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
   (
     # Java identifier Camel Case
       [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case
     | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
     | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
     # Perl module
     | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
     # Probably an abbreviation
     | [A-Z]{3,}
     # Perl/Python identifiers or Jinja templates
     | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
     # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
     | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
     | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
     # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
     | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
     # Short args
     | -[a-z0-9]+
     # Things like 32bit
     | \d{2,}-?[a-z]+
     # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
     | src:[a-z0-9][-+.a-z0-9]+
     | [a-z0-9][-+.a-z0-9]+:(?:any|native)
     # Version
     | v\d+(?:[.]\S+)?
     # chmod symbolic mode or math
     | \S*=\S+
   )
""",
    re.VERBOSE,
)

# An email address wrapped in angle brackets, e.g. "<someone@example.org>".
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)

# Shared "nothing to correct" result.
_NO_CORRECTIONS = ()

# Word lists shipped in the package data directory.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Name lists; these are read with `namelist=True`, which also adds the
# possessive ("<name>'s") form of every entry.
_NAMELISTS = [
    "logins-and-people.dic",
]
# Personal dictionaries; a leading "${VAR}" is substituted from the
# environment when the dictionaries are loaded.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]

# hunspell support is optional.  It is only enabled when both the en_US
# dictionary files are present and the Python binding can be imported; a
# missing dictionary file is treated exactly like a missing module.
try:
    if not (
        os.path.lexists(_SPELL_CHECKER_DICT)
        and os.path.lexists(_SPELL_CHECKER_AFF)
    ):
        raise ImportError
    from hunspell import HunSpell
except ImportError:
    _HAS_HUNSPELL = False
else:
    _HAS_HUNSPELL = True

106

107

108def _read_wordlist(

109 base_dir: Traversable,

110 wordlist_name: str,

111 *,

112 namelist: bool = False,

113) -> Iterable[str]:

114 path = base_dir.joinpath(wordlist_name)

115 with path.open("r", encoding="utf-8") as fd:

116 w = [w.strip() for w in fd]

117 yield from w

118 if namelist:

119 yield from (f"{n}'s" for n in w)

120

121

def _all_debian_archs() -> Iterable[str]:
    """Return the architecture names printed by ``dpkg-architecture -L``.

    This is best effort: if the command cannot be started or exits with a
    non-zero status, a warning is emitted and an empty tuple is returned.

    :return: An iterable with one stripped entry per output line.
    """
    try:
        output = subprocess.check_output(["dpkg-architecture", "-L"])
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError (rather than only FileNotFoundError) so that other launch
        # failures, such as a non-executable binary, also degrade gracefully.
        _warn(f"dpkg-architecture -L failed: {e}")
        return tuple()

    return (x.strip() for x in output.decode("utf-8").splitlines())

130

131

@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Collect the words that are always accepted as correctly spelled.

    The set is the union of the bundled word lists, the bundled name lists
    (including their possessive forms), the entries of `Release.releases`
    and the output of `_all_debian_archs`.  The result is cached.
    """
    data_dir = importlib.resources.files(data_wordlist.__name__)
    accepted = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(data_dir, wordlist))
    for names in _NAMELISTS:
        accepted.update(_read_wordlist(data_dir, names, namelist=True))
    accepted.update(Release.releases)
    accepted.update(_all_debian_archs())
    return frozenset(accepted)

148

149

# Process-wide spellchecker instance.  `None` means "not decided yet"; it is
# filled in lazily by `default_spellchecker` and may be replaced by
# `disable_spellchecking` or `_testing_set_default_spellchecker`.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None

151

152

def spellcheck_line(
    lint_state: "LintState",
    line_no: int,
    line: str,
) -> int:
    """Spellcheck one line and emit a diagnostic per suspected typo.

    :param lint_state: Provides the spellchecker and receives the diagnostics.
    :param line_no: Line number used for the positions of the diagnostics.
    :param line: The text to check.
    :return: The number of words for which corrections were proposed.
    """
    checker = lint_state.spellchecker()
    typo_count = 0
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            typo_range = TERange(
                TEPosition(line_no, start),
                TEPosition(line_no, end),
            )
            typo_count += 1
            lint_state.emit_diagnostic(
                typo_range,
                f'Spelling "{word}"',
                "spelling",
                "debputy",
                quickfixes=[
                    propose_correct_text_quick_fix(suggestion)
                    for suggestion in suggestions
                ],
                enable_non_interactive_auto_fix=False,
            )
    return typo_count

178

179

def default_spellchecker() -> "Spellchecker":
    """Return the process-wide spellchecker, creating it on first use.

    The hunspell backed checker is used when hunspell is available;
    otherwise the checker that accepts every word is used.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = (
            HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
        )
    return _DEFAULT_SPELL_CHECKER

190

191

@functools.lru_cache
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the (cached) spellchecker that accepts every word."""
    return EverythingIsCorrectSpellchecker()

195

196

def disable_spellchecking() -> None:
    """Make the default spellchecker the one that accepts every word."""
    global _DEFAULT_SPELL_CHECKER
    _DEFAULT_SPELL_CHECKER = _do_nothing_spellchecker()

200

201

def _testing_set_default_spellchecker(
    spellchecker: Optional["Spellchecker"],
) -> Optional["Spellchecker"]:
    """Replace the default spellchecker and hand back the previous one.

    :param spellchecker: The new default; `None` resets it to "not decided".
    :return: The value that was installed before the call.
    """
    global _DEFAULT_SPELL_CHECKER
    previous, _DEFAULT_SPELL_CHECKER = _DEFAULT_SPELL_CHECKER, spellchecker
    return previous

209

210

def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of `line` that are outside quoted spans.

    A quoted span starts at a double quote or backtick and ends at the next
    occurrence of the same character.  An opening quote without a matching
    close is not treated as a quote; the rest of the line is yielded as is.

    :return: Pairs of (text, offset of that text within `line`).
    """
    cursor = 0
    while True:
        opening = _FIND_QUOTE_CHAR.search(line, cursor)
        if opening is None:
            break
        open_at = opening.start()
        close_at = line.find(opening.group(), open_at + 1)
        if close_at < 0:
            # Unterminated quote
            break

        unquoted = line[cursor:open_at]
        if not unquoted.isspace():
            yield unquoted, cursor
        cursor = close_at + 1

    # Whatever follows the last complete quoted span (or the whole line).
    yield line[cursor:], cursor

234

235

def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield the words of `line` that should be spellchecked.

    Quoted spans are skipped, as are whitespace separated tokens that start
    with ``--`` or look like a programming term, a file name or an email
    address.  Remaining tokens are stripped of surrounding symbols.

    :return: Triples of (word, start offset, end offset) relative to `line`.
    """
    ignored_shapes = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(segment):
            text = token.group(1)
            if text.startswith("--"):
                # CLI arg
                continue
            if any(shape.match(text) for shape in ignored_shapes):
                continue
            # Offset of the token within the full line.
            base = segment_start + token.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(text):
                yield piece.group(1), base + piece.start(1), base + piece.end(1)

254

255

class Spellchecker:
    """Base class for spellcheckers.

    Subclasses must implement `provide_corrections_for`.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield (word, start, end) for each word of `line` worth checking."""
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return proposed corrections for `word`; empty when it is fine."""
        raise NotImplementedError

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return a checker that additionally accepts the given `words`.

        When `words` is empty (falsy), this checker itself is returned.
        """
        return ContextIgnoredWordsSpellchecker(self, words) if words else self

272

273

class ContextIgnoredWordsSpellchecker(Spellchecker):
    """Wrap another spellchecker and accept an extra set of words.

    Membership is tested with the lower-cased form of the word being checked.
    """

    def __init__(
        self, spellchecker: Spellchecker, context_ignored_words: Container[str]
    ) -> None:
        self._spellchecker = spellchecker
        self._context_ignored_words = context_ignored_words

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Delegate word splitting to the wrapped spellchecker."""
        return self._spellchecker.iter_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return no corrections for ignored words; otherwise delegate."""
        is_ignored = word.lower() in self._context_ignored_words
        return (
            _NO_CORRECTIONS
            if is_ignored
            else self._spellchecker.provide_corrections_for(word)
        )

289

290

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never proposes a correction."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the empty corrections result for every word."""
        return _NO_CORRECTIONS

    def context_ignored_words(self, words: Container[str]) -> "Spellchecker":
        """Return this checker unchanged; every word is already accepted."""
        return self

297

298

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell.

    On construction the built-in exception words are added to the hunspell
    instance and any existing personal dictionaries are loaded.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()
        # Per-instance cache.  Decorating the method with `lru_cache` would
        # key the (class-wide) cache on `self` and keep every instance alive
        # for as long as its entries stay in the cache.
        self._lookup = functools.lru_cache(maxsize=128)(self._lookup_uncached)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return proposed corrections for `word`; empty when it is accepted.

        Words with well-known tool/variable prefixes and a few deliberate
        suffixes are accepted without consulting hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup_uncached(self, word: str) -> Iterable[str]:
        """Ask hunspell about `word`; results are cached via `self._lookup`."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # Store an immutable tuple so that a caller mutating the result cannot
        # corrupt the cached value.
        return tuple(self._checker.suggest(word))

    def _load_personal_exclusions(self) -> None:
        """Load each personal dictionary that exists as a regular file.

        A leading ``${VAR}`` in the configured path is replaced by the value
        of that environment variable; the entry is skipped when it is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)