"""
A robot exclusion rules parser for Python by Philip Semanchuk

Full documentation, examples and a comparison to Python's robotparser module
are available at http://NikitaTheSpider.com/python/rerp/

Comments, bug reports, etc. are most welcome via email.

Simple usage examples:

    import robotexclusionrulesparser

    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()

    try:
        rerp.fetch('http://www.example.com/robots.txt')
    except:
        # See the documentation for expected errors
        pass

    if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
        print("It is OK to fetch /foo.html")

OR supply the contents of robots.txt yourself:

    rerp = RobotExclusionRulesParser()
    s = open("robots.txt").read()
    rerp.parse(s)

    if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
        print("It is OK to fetch /foo.html")

The function is_expired() tells you if you need to fetch a fresh copy of
this robots.txt.

RobotExclusionRulesParser supports __unicode__() and __str__() so you can
print an instance to see its rules in robots.txt format.

The comments refer to MK1994, MK1996 and GYM2008. These are:
MK1994 = the 1994 robots.txt draft spec (http://www.robotstxt.org/orig.html)
MK1996 = the 1996 robots.txt draft spec (http://www.robotstxt.org/norobots-rfc.txt)
GYM2008 = the Google-Yahoo-Microsoft extensions announced in 2008
(http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360)

This code is released under the following BSD license --

Copyright (c) 2010, Philip Semanchuk

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of robotexclusionrulesparser nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY ITS CONTRIBUTORS ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Philip Semanchuk BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import sys
import re
import time
import calendar

PY_MAJOR_VERSION = sys.version_info[0]

if PY_MAJOR_VERSION < 3:
    from urlparse import urlparse as urllib_urlparse
    from urlparse import urlunparse as urllib_urlunparse
    from urllib import unquote as urllib_unquote
    import urllib2 as urllib_request
    import urllib2 as urllib_error
else:
    from urllib.parse import urlparse as urllib_urlparse
    from urllib.parse import urlunparse as urllib_urlunparse
    from urllib.parse import unquote as urllib_unquote
    import urllib.request as urllib_request
    import urllib.error as urllib_error

# The date-parsing functions this module needs live in email.utils under
# Python 3 (and recent Python 2), and in the deprecated rfc822 module
# before that.
try:
    import email.utils as email_utils
except ImportError:
    import rfc822 as email_utils

_end_of_line_regex = re.compile(r"(?:\r\n)|\r|\n")

_directive_regex = re.compile(
    "(allow|disallow|user[-]?agent|sitemap|crawl-delay):[ \t]*(.*)",
    re.IGNORECASE)

# This is the expiration period applied when the server doesn't supply one.
SEVEN_DAYS = 60 * 60 * 24 * 7

# The two robots.txt syntaxes this module understands (see the module
# docstring); the specific values are arbitrary as long as they differ.
MK1996 = 1
GYM2008 = 2

# fetch() reads at most this many bytes of a robots.txt file.
MAX_FILESIZE = 100 * 1024

# Control characters are everything < 0x20 plus 0x7f (DEL).
_control_characters_regex = re.compile(r"[\000-\037]|\177")

_charset_extraction_regex = re.compile(
    r"""charset=['"]?(?P<encoding>[^'"]*)['"]?""")

def _raise_error(error, message):
    # The raise statement's syntax differs between Python 2 and 3, so it's
    # built as a string and exec-ed to keep this module importable by both.
    s = "raise "
    s += "error, message" if (PY_MAJOR_VERSION == 2) else "error(message)"
    exec(s)


def _unquote_path(path):
    # MK1996 says a %xx-encoded octet must be decoded prior to comparison,
    # unless it is "%2F" (the "/" character, which has special meaning in a
    # path). Encoded slashes are parked behind a placeholder that can't
    # otherwise appear in the path, unquoted, and then restored.
    path = re.sub("%2[fF]", "\n", path)
    path = urllib_unquote(path)
    return path.replace("\n", "%2F")

def _scrub_data(s):
    # Data is the value portion of a robots.txt line (a path or a user
    # agent name). Scrubbing strips control characters, turns tabs into
    # spaces, and trims surrounding whitespace.
    s = _control_characters_regex.sub("", s)
    s = s.replace("\t", " ")
    return s.strip()

def _parse_content_type_header(header):
    # A typical Content-Type header looks like this:
    #     text/plain; charset=UTF-8
    # The parameter portion after the media type is optional and often
    # absent.
    media_type = ""
    encoding = ""

    if header:
        header = header.strip().lower()
    else:
        header = ""

    chunks = [s.strip() for s in header.split(";")]
    media_type = chunks[0]
    for parameter in chunks[1:]:
        m = _charset_extraction_regex.search(parameter)
        if m and m.group("encoding"):
            encoding = m.group("encoding")

    return media_type.strip(), encoding.strip()
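
# Sample result (hypothetical header value):
#
#     >>> _parse_content_type_header("text/html; charset=UTF-8")
#     ('text/html', 'utf-8')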
188 """ _Ruleset represents a set of allow/disallow rules (and possibly a 189 crawl delay) that apply to a set of user agents. 191 Users of this module don't need this class. It's available at the module 192 level only because RobotExclusionRulesParser() instances can't be 193 pickled if _Ruleset isn't visible a the module level. 205 if PY_MAJOR_VERSION == 2:
206 s = s.encode(
"utf-8")
213 s =
''.
join( [
"User-agent: %s\n" % name
for name
in self.
robot_names] )
218 s +=
''.
join( [
"%s: %s\n" % (d[rule_type], path)
for rule_type, path
in self.
rules] )

    def does_user_agent_match(self, user_agent):
        match = False
        for robot_name in self.robot_names:
            # MK1996 calls for a case-insensitive substring match of the
            # robot's name; '*' matches any user agent.
            match = match or (robot_name == '*') or \
                    (robot_name.lower() in user_agent.lower())

        return match
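
    # Illustration (hypothetical names): matching is a case-insensitive
    # substring test, so a record for "crunchyfrogbot" also matches a full
    # user agent string that merely contains that token.
    #
    #     >>> rs = _Ruleset()
    #     >>> rs.add_robot_name("crunchyfrogbot")
    #     >>> rs.does_user_agent_match("Mozilla/5.0 (compatible; CrunchyFrogBot/1.0)")
    #     True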

    def is_url_allowed(self, url, syntax=GYM2008):
        allowed = True

        # Schemes and host names are not part of the robots.txt protocol,
        # so they're ignored here. It is the caller's responsibility to
        # make sure they match.
        _, _, path, parameters, query, fragment = urllib_urlparse(url)
        url = urllib_urlunparse(("", "", path, parameters, query, fragment))

        url = _unquote_path(url)

        done = False
        i = 0
        while not done:
            rule_type, path = self.rules[i]

            if (syntax == GYM2008) and ("*" in path or path.endswith("$")):
                # GYM2008-specific syntax: '*' is a wildcard and a trailing
                # '$' anchors the rule at the end of the URL. The path is
                # translated into an equivalent regex.
                if path.endswith("$"):
                    appendix = "$"
                    path = path[:-1]
                else:
                    appendix = ""
                parts = path.split("*")
                pattern = "%s%s" % \
                    (".*".join([re.escape(p) for p in parts]), appendix)
                if re.match(pattern, url):
                    # Ding!
                    done = True
                    allowed = (rule_type == self.ALLOW)
            else:
                # Wildcards are either not present or are taken literally.
                if url.startswith(path):
                    # Ding!
                    done = True
                    allowed = (rule_type == self.ALLOW)
                    # A blank path means "nothing", which negates the value
                    # above; e.g. "Disallow:   " means allow everything.
                    if not path:
                        allowed = not allowed

            i += 1
            if i == len(self.rules):
                done = True

        return allowed
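
    # A sketch of the GYM2008 wildcard handling with hypothetical rules:
    # "Disallow: /private*/" becomes the regex "/private.*/", so any URL
    # whose path starts with "/private<anything>/" is refused, while other
    # paths fall through to the default (allowed).
    #
    #     >>> rs = _Ruleset()
    #     >>> rs.add_robot_name("*")
    #     >>> rs.add_disallow_rule("/private*/")
    #     >>> rs.is_url_allowed("/private12/index.html")
    #     False
    #     >>> rs.is_url_allowed("/public/index.html")
    #     True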
301 """A parser for robots.txt files.""" 314 """The URL from which this robots.txt was fetched. Read only.""" 319 """The remote server's response code. Read only.""" 324 """Deprecated; use 'sitemaps' instead. Returns the sitemap URL present 325 in the robots.txt, if any. Defaults to None. Read only.""" 326 _raise_error(DeprecationWarning,
"The sitemap property is deprecated. Use 'sitemaps' instead.")
330 """The sitemap URLs present in the robots.txt, if any. Defaults 331 to an empty list. Read only.""" 338 """True if the difference between now and the last call to fetch() 339 exceeds the robots.txt expiration. Read only. 349 return calendar.timegm(time.gmtime())
353 """True if the user agent is permitted to visit the URL. The syntax 354 parameter can be GYM2008 (the default) or MK1996 for strict adherence 355 to the traditional standard. 369 if not isinstance(user_agent, unicode):
370 user_agent = user_agent.decode()
371 if not isinstance(url, unicode):
374 if syntax
not in (MK1996, GYM2008):
375 _raise_error(ValueError,
"Syntax must be MK1996 or GYM2008")
378 if ruleset.does_user_agent_match(user_agent):
379 return ruleset.is_url_allowed(url, syntax)
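
    # A minimal end-to-end check with inline rules (hypothetical content):
    #
    #     >>> rerp = RobotExclusionRulesParser()
    #     >>> rerp.parse(u"User-agent: *\nDisallow: /private/\n")
    #     >>> rerp.is_allowed("CrunchyFrogBot", "/private/secret.html")
    #     False
    #     >>> rerp.is_allowed("CrunchyFrogBot", "/foo.html")
    #     True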
385 """Returns a float representing the crawl delay specified for this 386 user agent, or None if the crawl delay was unspecified or not a float. 389 if (PY_MAJOR_VERSION < 3)
and (
not isinstance(user_agent, unicode)):
390 user_agent = user_agent.decode()
393 if ruleset.does_user_agent_match(user_agent):
394 return ruleset.crawl_delay
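
    # Example (hypothetical robots.txt content): the delay applies only to
    # the matching user agent.
    #
    #     >>> rerp.parse(u"User-agent: slurp\nCrawl-delay: 1.5\nDisallow: /\n")
    #     >>> rerp.get_crawl_delay("Slurp/3.0")
    #     1.5
    #     >>> rerp.get_crawl_delay("OtherBot") is None
    #     True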
400 """Attempts to fetch the URL requested which should refer to a 401 robots.txt file, e.g. http://example.com/robots.txt. 407 encoding =
"iso-8859-1" 409 expires_header =
None 410 content_type_header =
None 415 req = urllib_request.Request(url,
None, {
'User-Agent' : self.
user_agent })
417 req = urllib_request.Request(url)
421 f = urllib_request.urlopen(req, timeout=timeout)
423 f = urllib_request.urlopen(req)
425 content = f.read(MAX_FILESIZE)
428 expires_header = f.info().get(
"expires")
429 content_type_header = f.info().get(
"Content-Type")
432 if hasattr(f,
"code"):
437 except urllib_error.URLError:
440 error_instance = sys.exc_info()
441 if len(error_instance) > 1:
442 error_instance = error_instance[1]
443 if hasattr(error_instance,
"code"):
497 encoding =
"iso-8859-1" 501 content =
"User-agent: *\nDisallow: /\n" 509 if((PY_MAJOR_VERSION == 2)
and isinstance(content, str))
or \
510 ((PY_MAJOR_VERSION > 2)
and (
not isinstance(content, str))):
516 content = content.decode(encoding)
519 "Robots.txt contents are not in the encoding expected (%s)." % encoding)
520 except (LookupError, ValueError):
525 _raise_error(UnicodeError,
"I don't understand the encoding \"%s\"." % encoding)
533 """Parses the passed string as a set of robots.txt rules.""" 537 if(PY_MAJOR_VERSION > 2)
and (isinstance(s, bytes)
or isinstance(s, bytearray))
or \
538 (PY_MAJOR_VERSION == 2)
and (
not isinstance(s, unicode)):
539 s = s.decode(
"iso-8859-1")
542 s = _end_of_line_regex.sub(
"\n", s)
544 lines = s.split(
"\n")
546 previous_line_was_a_user_agent =
False 547 current_ruleset =
None 552 if line
and line[0] ==
'#':
559 if i != -1: line = line[:i]
565 if current_ruleset
and current_ruleset.is_not_empty():
568 current_ruleset =
None 569 previous_line_was_a_user_agent =
False 584 matches = _directive_regex.findall(line)
588 field, data = matches[0]
589 field = field.lower()
594 if field
in (
"useragent",
"user-agent"):
595 if previous_line_was_a_user_agent:
597 if current_ruleset
and data:
598 current_ruleset.add_robot_name(data)
601 if current_ruleset
and current_ruleset.is_not_empty():
610 current_ruleset.add_robot_name(data)
612 previous_line_was_a_user_agent =
True 613 elif field ==
"allow":
614 previous_line_was_a_user_agent =
False 616 current_ruleset.add_allow_rule(data)
617 elif field ==
"sitemap":
618 previous_line_was_a_user_agent =
False 620 elif field ==
"crawl-delay":
623 previous_line_was_a_user_agent =
False 626 current_ruleset.crawl_delay = float(data)
632 previous_line_was_a_user_agent =
False 634 current_ruleset.add_disallow_rule(data)
636 if current_ruleset
and current_ruleset.is_not_empty():
645 not_defaults = [r
for r
in self.
__rulesets if not r.is_default()]
646 defaults = [r
for r
in self.
__rulesets if r.is_default()]
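
    # Sitemap directives live outside any ruleset, so they're collected
    # regardless of position (hypothetical content):
    #
    #     >>> rerp = RobotExclusionRulesParser()
    #     >>> rerp.parse(u"User-agent: *\nDisallow: /b\n\nSitemap: http://example.com/sitemap.xml\n")
    #     >>> rerp.sitemaps
    #     ['http://example.com/sitemap.xml']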

    def __str__(self):
        s = self.__unicode__()
        if PY_MAJOR_VERSION == 2:
            s = s.encode("utf-8")
        return s

    def __unicode__(self):
        s = ''.join(["Sitemap: %s\n" % sitemap for sitemap in self._sitemaps])
        # unicode() is the appropriate stringifier under Python 2, str()
        # under Python 3.
        stringify = (unicode if (PY_MAJOR_VERSION == 2) else str)
        return s + '\n'.join([stringify(ruleset) for ruleset in self.__rulesets])
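
    # Printing an instance round-trips the parsed rules back into
    # robots.txt format:
    #
    #     >>> rerp = RobotExclusionRulesParser()
    #     >>> rerp.parse(u"User-agent: *\nDisallow: /private/\n")
    #     >>> print(rerp)
    #     User-agent: *
    #     Disallow: /private/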
672 """A drop-in replacement for the Python standard library's RobotFileParser 673 that retains all of the features of RobotExclusionRulesParser. 676 RobotExclusionRulesParser.__init__(self)
695 RobotExclusionRulesParser.parse(self,
''.
join(lines))
699 return RobotExclusionRulesParser.is_allowed(self, user_agent, url, syntax)
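
# A drop-in usage sketch mirroring the standard library's RobotFileParser
# API (the rules shown are hypothetical):
#
#     >>> rfp = RobotFileParserLookalike()
#     >>> rfp.parse([u"User-agent: *\n", u"Disallow: /private/\n"])
#     >>> rfp.can_fetch("CrunchyFrogBot", "/private/secret.html")
#     False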