
Commit 5db5c06

mcscope authored and ned-deily committed
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
1 parent 7a1c027 commit 5db5c06

File tree

5 files changed, +47 -0 lines changed


Doc/library/urllib.robotparser.rst

Lines changed: 9 additions & 0 deletions
@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
 
       .. versionadded:: 3.6
 
+   .. method:: site_maps()
+
+      Returns the contents of the ``Sitemap`` parameter from
+      ``robots.txt`` in the form of a :func:`list`. If there is no such
+      parameter or the ``robots.txt`` entry for this parameter has
+      invalid syntax, return ``None``.
+
+      .. versionadded:: 3.8
+
 
 The following example demonstrates basic use of the :class:`RobotFileParser`
 class::
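
The documented behaviour can be exercised in the same style as the module's existing example; a minimal sketch (Python 3.8+; the URL is illustrative and ``read()`` fetches it over the network)::

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")  # illustrative URL
    rp.read()  # download and parse robots.txt
    print(rp.site_maps())  # list of Sitemap URLs, or None if none are declared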

Lib/test/test_robotparser.py

Lines changed: 21 additions & 0 deletions
@@ -12,6 +12,7 @@ class BaseRobotTest:
     agent = 'test_robotparser'
     good = []
     bad = []
+    site_maps = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ def test_bad_urls(self):
             with self.subTest(url=url, agent=agent):
                 self.assertFalse(self.parser.can_fetch(agent, url))
 
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -65,6 +69,23 @@ class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     bad = ['/cyberworld/map/index.html']
 
 
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+"""
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+                 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
 class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 # go away
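
The new test's assertion can be reproduced standalone; a minimal sketch feeding ``parse()`` an abridged version of the ``SitemapTest`` robots.txt (test classes that declare no ``Sitemap`` lines inherit ``site_maps = None`` from ``BaseRobotTest``, which is exactly what the parser returns for them)::

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        "User-agent: *",
        "Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml",
        "Sitemap: http://www.google.com/hostednews/sitemap_index.xml",
        "Disallow: /cyberworld/map/",
    ])
    assert parser.site_maps() == [
        'http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
        'http://www.google.com/hostednews/sitemap_index.xml']

    # A file without any Sitemap lines yields None.
    empty = urllib.robotparser.RobotFileParser()
    empty.parse(["User-agent: *", "Disallow: /"])
    assert empty.site_maps() is None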

Lib/urllib/robotparser.py

Lines changed: 12 additions & 0 deletions
@@ -27,6 +27,7 @@ class RobotFileParser:
 
     def __init__(self, url=''):
         self.entries = []
+        self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
@@ -141,6 +142,12 @@ def parse(self, lines):
                             and numbers[1].strip().isdigit()):
                         entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                     state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    #  so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
 
@@ -189,6 +196,11 @@ def request_rate(self, useragent):
                 return entry.req_rate
         return self.default_entry.req_rate
 
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
     def __str__(self):
         entries = self.entries
         if self.default_entry is not None:
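
As the inline comment explains, ``Sitemap`` is independent of ``User-agent`` groups, so the parser records it without touching its state machine; a sketch with assumed robots.txt lines::

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        "Sitemap: http://www.example.com/a.xml",  # before any group: still recorded
        "User-agent: *",
        "Disallow: /private/",
        "Sitemap: http://www.example.com/b.xml",  # inside a group: appended in order
    ])
    print(parser.site_maps())
    # ['http://www.example.com/a.xml', 'http://www.example.com/b.xml']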

Misc/ACKS

Lines changed: 2 additions & 0 deletions
@@ -109,6 +109,7 @@ Anthony Baxter
 Mike Bayer
 Samuel L. Bayer
 Bo Bayles
+Christopher Beacham AKA Lady Red
 Tommy Beadle
 Donald Beaudry
 David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
 Blake Winton
 Jean-Claude Wippler
 Stéphane Wirtel
+Peter Wirtz
 Lars Wirzenius
 John Wiseman
 Chris Withers
Misc/NEWS.d entry (new file)

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Added support for Site Maps to urllib's ``RobotFileParser`` as
+:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
+Patch by Lady Red, based on patch by Peter Wirtz.
