Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Lib/test/test_robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,8 +388,8 @@ class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
expected_output = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path\
Disallow: /another/path
Disallow: /some/path\
"""


Expand Down Expand Up @@ -445,10 +445,10 @@ class WeirdPathTest(BaseRobotTest, unittest.TestCase):
'/e$$', '/ex$y$', '/g']
expected_output = """\
User-agent: *
Disallow: /a$
Disallow: /c*
Disallow: /d*z
Disallow: /e*$
Disallow: /a$
Disallow: /g$\
"""

Expand Down
15 changes: 15 additions & 0 deletions Lib/urllib/robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def _add_entry(self, entry):
self.groups[agent] = entry
else:
self.groups[agent] = merge_entries(self.groups[agent], entry)
sort_rulelines(self.groups[agent].rulelines)

def parse(self, lines):
"""Parse the input lines from a robots.txt file.
Expand Down Expand Up @@ -305,6 +306,9 @@ def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- filename is URL encoded
- rules are sorted:
- wildcards before literal paths
- literal paths from longest to shortest, "Allow" before "Disallow"
"""
best_match = -1
allowance = True
Expand All @@ -316,6 +320,9 @@ def allowance(self, filename):
allowance = line.allowance
elif m == best_match and not allowance:
allowance = line.allowance
# Optimization. Requires rules to be sorted.
if line.matcher is None and (m or len(line.path) + 1 < best_match):
break
return allowance


Expand Down Expand Up @@ -353,3 +360,11 @@ def merge_entries(e1, e2):
entry.delay = e1.delay if e2.delay is None else e2.delay
entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate
return entry

def sort_rulelines(rulelines):
    """Sort *rulelines* in place into the order the rule lookup expects.

    Resulting order:
      - rules with a matcher (wildcard rules) come first;
      - rules without a matcher (literal paths) follow, from the longest
        path to the shortest;
      - for literal paths of equal length, rules whose ``allowance`` is
        true ("Allow") come before those whose ``allowance`` is false
        ("Disallow").

    Rules that compare equal keep their original relative order, because
    ``list.sort`` is stable even with ``reverse=True``.

    Each element must expose ``matcher`` and ``allowance``; ``path`` is only
    read for rules whose ``matcher`` is None.  Returns None, like
    ``list.sort``.
    """
    def sortkey(line):
        if line.matcher is not None:
            # One-element key: True outranks every literal key's leading
            # False, so all wildcard rules end up ahead of the literals and
            # tie with one another.
            return (True,)
        return (False, len(line.path), line.allowance)

    # reverse=True turns the ascending keys into: wildcards, then longest
    # literal path first, then allowance True before False.
    rulelines.sort(key=sortkey, reverse=True)
Loading