Skip to content

Commit 3936fd7

Browse files
miss-islington, michael-lazar
authored and committed
[3.7] bpo-32861: urllib.robotparser fix incomplete __str__ methods. (GH-5711) (GH-6795) (GH-6818)
The urllib.robotparser's __str__ representation now includes wildcard entries and the "Crawl-delay" and "Request-rate" fields.

(cherry picked from commit c3fa1f2)

Co-authored-by: Michael Lazar <[email protected]>
1 parent 914bad6 commit 3936fd7

File tree

3 files changed

+43
-5
lines changed

3 files changed

+43
-5
lines changed

Lib/test/test_robotparser.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,33 @@ class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
249249
bad = ['/cyberworld/map/index.html']
250250

251251

252+
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
253+
robots_txt = """\
254+
User-agent: *
255+
Crawl-delay: 1
256+
Request-rate: 3/15
257+
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
258+
259+
# Cybermapper knows where to go.
260+
User-agent: cybermapper
261+
Disallow: /some/path
262+
"""
263+
264+
expected_output = """\
265+
User-agent: cybermapper
266+
Disallow: /some/path
267+
268+
User-agent: *
269+
Crawl-delay: 1
270+
Request-rate: 3/15
271+
Disallow: /cyberworld/map/
272+
273+
"""
274+
275+
def test_string_formatting(self):
276+
self.assertEqual(str(self.parser), self.expected_output)
277+
278+
252279
class RobotHandler(BaseHTTPRequestHandler):
253280

254281
def do_GET(self):

Lib/urllib/robotparser.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,10 @@ def request_rate(self, useragent):
190190
return self.default_entry.req_rate
191191

192192
def __str__(self):
193-
return ''.join([str(entry) + "\n" for entry in self.entries])
193+
entries = self.entries
194+
if self.default_entry is not None:
195+
entries = entries + [self.default_entry]
196+
return '\n'.join(map(str, entries)) + '\n'
194197

195198

196199
class RuleLine:
@@ -222,10 +225,15 @@ def __init__(self):
222225
def __str__(self):
223226
ret = []
224227
for agent in self.useragents:
225-
ret.extend(["User-agent: ", agent, "\n"])
226-
for line in self.rulelines:
227-
ret.extend([str(line), "\n"])
228-
return ''.join(ret)
228+
ret.append(f"User-agent: {agent}")
229+
if self.delay is not None:
230+
ret.append(f"Crawl-delay: {self.delay}")
231+
if self.req_rate is not None:
232+
rate = self.req_rate
233+
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
234+
ret.extend(map(str, self.rulelines))
235+
ret.append('') # for compatibility
236+
return '\n'.join(ret)
229237

230238
def applies_to(self, useragent):
231239
"""check if this entry applies to the specified agent"""
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The urllib.robotparser's ``__str__`` representation now includes wildcard
2+
entries and the "Crawl-delay" and "Request-rate" fields. Patch by
3+
Michael Lazar.

0 commit comments

Comments
 (0)