Skip to content

Commit d41eceb

Browse files
authored
Merge pull request #231 from dhellmann/pypi-filter-factory-json
use json api to retrieve names from pypi.org
2 parents 65a9b84 + 2eafb07 commit d41eceb

File tree

4 files changed

+82
-70
lines changed

4 files changed

+82
-70
lines changed

docs/source/history.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ Bug Fixes
3535

3636
- `#229 <https://github.com/sphinx-contrib/spelling/pull/229>`__ Gracefully
3737
handle if git is not installed
38+
- `#227 <https://github.com/sphinx-contrib/spelling/issues/227>`__ Use pypi.org's
39+
JSON API instead of XML-RPC.
3840

3941
7.7.0
4042
=====

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ classifiers = [
3030

3131
requires-python = ">=3.10"
3232

33-
dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0"]
33+
dependencies = ["PyEnchant>=3.1.1", "Sphinx>=3.0.0", "requests>=2.32.3"]
3434

3535
[project.optional-dependencies]
3636
test = ["pytest", "pytest-cov", "coverage!=4.4,>=4.0"]

sphinxcontrib/spelling/filters.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
#
22
# Copyright (c) 2010 Doug Hellmann. All rights reserved.
33
#
4-
"""Spelling checker extension for Sphinx.
5-
"""
4+
"""Spelling checker extension for Sphinx."""
65

76
# TODO - Words with multiple uppercase letters treated as classes and ignored
87

98
import builtins
109
import importlib
1110
import subprocess
1211
import sys
13-
from xmlrpc import client as xmlrpc_client
1412

13+
import requests
1514
from enchant.tokenize import Filter, get_tokenizer, tokenize, unit_tokenize
1615
from sphinx.util import logging
1716

@@ -22,18 +21,19 @@ class AcronymFilter(Filter):
2221
"""If a word looks like an acronym (all upper case letters),
2322
ignore it.
2423
"""
24+
2525
def _skip(self, word):
2626
return (
27-
word.isupper() or # all caps
27+
word.isupper() # all caps
28+
or
2829
# pluralized acronym ("URLs")
29-
(word[-1].lower() == 's' and word[:-1].isupper())
30+
(word[-1].lower() == "s" and word[:-1].isupper())
3031
)
3132

3233

3334
class list_tokenize(tokenize):
34-
3535
def __init__(self, words):
36-
super().__init__('')
36+
super().__init__("")
3737
self._words = words
3838

3939
def next(self):
@@ -44,8 +44,8 @@ def next(self):
4444

4545

4646
class ContractionFilter(Filter):
47-
"""Strip common contractions from words.
48-
"""
47+
"""Strip common contractions from words."""
48+
4949
splits = {
5050
"aren't": ["are", "not"],
5151
"can't": ["can", "not"],
@@ -138,8 +138,7 @@ def _split(self, word):
138138

139139

140140
class IgnoreWordsFilter(Filter):
141-
"""Given a set of words, ignore them all.
142-
"""
141+
"""Given a set of words, ignore them all."""
143142

144143
def __init__(self, tokenizer, word_set):
145144
self.word_set = set(word_set)
@@ -150,7 +149,6 @@ def _skip(self, word):
150149

151150

152151
class IgnoreWordsFilterFactory:
153-
154152
def __init__(self, words):
155153
self.words = words
156154

@@ -159,23 +157,31 @@ def __call__(self, tokenizer):
159157

160158

161159
class PyPIFilterFactory(IgnoreWordsFilterFactory):
162-
"""Build an IgnoreWordsFilter for all of the names of packages on PyPI.
163-
"""
160+
"""Build an IgnoreWordsFilter for all of the names of packages on PyPI."""
161+
164162
def __init__(self):
165-
client = xmlrpc_client.ServerProxy('https://pypi.python.org/pypi')
166-
super().__init__(client.list_packages())
163+
r = requests.get(
164+
"https://pypi.org/simple/",
165+
headers={
166+
"user-agent": "sphinxcontrib.spelling",
167+
"accept": "application/vnd.pypi.simple.v1+json",
168+
},
169+
)
170+
names = [i["name"] for i in r.json()["projects"]]
171+
logger.debug("retrieved %d project names from pypi.org", len(names))
172+
super().__init__(names)
167173

168174

169175
class PythonBuiltinsFilter(Filter):
170-
"""Ignore names of built-in Python symbols.
171-
"""
176+
"""Ignore names of built-in Python symbols."""
177+
172178
def _skip(self, word):
173179
return hasattr(builtins, word)
174180

175181

176182
class ImportableModuleFilter(Filter):
177-
"""Ignore names of modules that we could import.
178-
"""
183+
"""Ignore names of modules that we could import."""
184+
179185
def __init__(self, tokenizer):
180186
super().__init__(tokenizer)
181187
self.found_modules = set(sys.builtin_module_names)
@@ -185,7 +191,7 @@ def __init__(self, tokenizer):
185191
# valid module, which is consistent with the behavior before
186192
# version 7.3.1. See
187193
# https://github.com/sphinx-contrib/spelling/issues/141
188-
self.sought_modules.add('__main__')
194+
self.sought_modules.add("__main__")
189195

190196
def _skip(self, word):
191197
# If the word looks like a python module filename, strip the
@@ -195,13 +201,13 @@ def _skip(self, word):
195201
# it look like Sphinx is complaining about a commandline
196202
# argument. See
197203
# https://github.com/sphinx-contrib/spelling/issues/142
198-
if word.endswith('.py'):
204+
if word.endswith(".py"):
199205
logger.debug(
200-
'removing .py extension from %r before searching for module',
201-
word)
206+
"removing .py extension from %r before searching for module", word
207+
)
202208
word = word[:-3]
203209

204-
valid_module_name = all(n.isidentifier() for n in word.split('.'))
210+
valid_module_name = all(n.isidentifier() for n in word.split("."))
205211
if not valid_module_name:
206212
return False
207213

@@ -214,8 +220,7 @@ def _skip(self, word):
214220
# error out of distutils, or something else triggered
215221
# by failing to be able to import a parent package to
216222
# use the metadata to search for a subpackage.
217-
logger.debug('find_spec(%r) failed, invalid module name: %s',
218-
word, err)
223+
logger.debug("find_spec(%r) failed, invalid module name: %s", word, err)
219224
else:
220225
if mod is not None:
221226
self.found_modules.add(word)
@@ -230,25 +235,28 @@ class ContributorFilter(IgnoreWordsFilter):
230235
tokens that are in the set.
231236
"""
232237

233-
_pretty_format = (
234-
'%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn'
235-
)
238+
_pretty_format = "%(trailers:key=Co-Authored-By,separator=%x0A)%x0A%an%x0A%cn"
236239

237240
def __init__(self, tokenizer):
238241
contributors = self._get_contributors()
239242
super().__init__(tokenizer, contributors)
240243

241244
def _get_contributors(self):
242-
logger.info('Scanning contributors')
243-
cmd = ['git', 'log', '--quiet', '--no-color',
244-
f'--pretty=format:{self._pretty_format}']
245+
logger.info("Scanning contributors")
246+
cmd = [
247+
"git",
248+
"log",
249+
"--quiet",
250+
"--no-color",
251+
f"--pretty=format:{self._pretty_format}",
252+
]
245253

246254
try:
247255
p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE)
248256
except (subprocess.CalledProcessError, FileNotFoundError) as err:
249-
logger.warning('Called: %s', ' '.join(cmd))
250-
logger.warning('Failed to scan contributors: %s', err)
257+
logger.warning("Called: %s", " ".join(cmd))
258+
logger.warning("Failed to scan contributors: %s", err)
251259
return set()
252-
output = p.stdout.decode('utf-8')
253-
tokenizer = get_tokenizer('en_US', filters=[])
260+
output = p.stdout.decode("utf-8")
261+
tokenizer = get_tokenizer("en_US", filters=[])
254262
return {word for word, pos in tokenizer(output)}

tests/test_filter.py

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#
22
# Copyright (c) 2010 Doug Hellmann. All rights reserved.
33
#
4-
"""Tests for filters.
5-
"""
4+
"""Tests for filters."""
65

76
import contextlib
87
import logging
@@ -12,38 +11,38 @@
1211
import pytest
1312
from enchant.tokenize import get_tokenizer
1413

15-
from sphinxcontrib.spelling import filters # isort:skip
16-
from tests import helpers # isort:skip
14+
from sphinxcontrib.spelling import filters # isort:skip
15+
from tests import helpers # isort:skip
1716

1817
# Replace the sphinx logger with a normal one so pytest can collect
1918
# the output.
20-
filters.logger = logging.getLogger('test.filters')
19+
filters.logger = logging.getLogger("test.filters")
2120

2221

2322
def test_builtin_unicode():
2423
f = filters.PythonBuiltinsFilter(None)
25-
assert not f._skip('passé')
24+
assert not f._skip("passé")
2625

2726

2827
def test_builtin_regular():
2928
f = filters.PythonBuiltinsFilter(None)
30-
assert f._skip('print')
29+
assert f._skip("print")
3130

3231

3332
def test_acronym():
34-
text = 'a front-end for DBM-style databases'
35-
t = get_tokenizer('en_US', [])
33+
text = "a front-end for DBM-style databases"
34+
t = get_tokenizer("en_US", [])
3635
f = filters.AcronymFilter(t)
3736
words = [w[0] for w in f(text)]
38-
assert 'DBM' not in words, 'Failed to filter out acronym'
37+
assert "DBM" not in words, "Failed to filter out acronym"
3938

4039

4140
def test_acronym_unicode():
42-
text = 'a front-end for DBM-style databases'
43-
t = get_tokenizer('en_US', [])
41+
text = "a front-end for DBM-style databases"
42+
t = get_tokenizer("en_US", [])
4443
f = filters.AcronymFilter(t)
4544
words = [w[0] for w in f(text)]
46-
assert 'DBM' not in words, 'Failed to filter out acronym'
45+
assert "DBM" not in words, "Failed to filter out acronym"
4746

4847

4948
@helpers.require_git_repo
@@ -77,7 +76,7 @@ def test_acronym_unicode():
7776
"Timotheus",
7877
"Tobias",
7978
"Tricoli",
80-
]
79+
],
8180
)
8281
def test_contributors(name):
8382
f = filters.ContributorFilter(None)
@@ -87,11 +86,11 @@ def test_contributors(name):
8786
@pytest.mark.parametrize(
8887
"word,expected",
8988
[
90-
('os', True),
91-
('os.name', False),
92-
('__main__', False),
89+
("os", True),
90+
("os.name", False),
91+
("__main__", False),
9392
("don't", False),
94-
]
93+
],
9594
)
9695
def test_importable_module_skip(word, expected):
9796
f = filters.ImportableModuleFilter(None)
@@ -110,42 +109,45 @@ def import_path(new_path):
110109

111110

112111
def test_importable_module_with_side_effets(tmpdir):
113-
logging.debug('tmpdir %r', tmpdir)
114-
logging.debug('cwd %r', os.getcwd())
112+
logging.debug("tmpdir %r", tmpdir)
113+
logging.debug("cwd %r", os.getcwd())
115114

116-
parentdir = tmpdir.join('parent')
115+
parentdir = tmpdir.join("parent")
117116
parentdir.mkdir()
118117

119-
parentdir.join('__init__.py').write(
120-
'raise SystemExit("exit as side-effect")\n'
121-
)
122-
parentdir.join('child.py').write('')
118+
parentdir.join("__init__.py").write('raise SystemExit("exit as side-effect")\n')
119+
parentdir.join("child.py").write("")
123120

124121
with import_path([str(tmpdir)] + sys.path):
125122
f = filters.ImportableModuleFilter(None)
126-
skip_parent = f._skip('parent')
127-
skip_both = f._skip('parent.child')
123+
skip_parent = f._skip("parent")
124+
skip_both = f._skip("parent.child")
128125

129126
# The parent module name is valid because it is not imported, only
130127
# discovered.
131128
assert skip_parent is True
132-
assert 'parent' in f.found_modules
129+
assert "parent" in f.found_modules
133130

134131
# The child module name is not valid because the parent is
135132
# imported to find the child and that triggers the side-effect.
136133
assert skip_both is False
137-
assert 'parent.child' not in f.found_modules
134+
assert "parent.child" not in f.found_modules
138135

139136

140137
def test_importable_module_with_system_exit(tmpdir):
141-
path = tmpdir.join('mytestmodule.py')
138+
path = tmpdir.join("mytestmodule.py")
142139
path.write('raise SystemExit("exit as side-effect")\n')
143140

144141
with import_path([str(tmpdir)] + sys.path):
145142
f = filters.ImportableModuleFilter(None)
146-
skip = f._skip('mytestmodule')
143+
skip = f._skip("mytestmodule")
147144

148145
# The filter does not actually import the module in this case, so
149146
# it shows up as a valid word.
150147
assert skip is True
151-
assert 'mytestmodule' in f.found_modules
148+
assert "mytestmodule" in f.found_modules
149+
150+
151+
def test_pypi_filter_factory():
152+
f = filters.PyPIFilterFactory()
153+
assert "sphinxcontrib-spelling" in f.words

0 commit comments

Comments
 (0)