HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
OwnRobots.py
1 """
2 A robot exclusion rules parser for Python by Philip Semanchuk
3 
4 Full documentation, examples and a comparison to Python's robotparser module
5 reside here:
6 http://NikitaTheSpider.com/python/rerp/
7 
8 Comments, bug reports, etc. are most welcome via email to:
9  philip@semanchuk.com
10 
11 Simple usage examples:
12 
13  import robotexclusionrulesparser
14 
15  rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
16 
17  try:
18  rerp.fetch('http://www.example.com/robots.txt')
19  except:
20  # See the documentation for expected errors
21  pass
22 
23  if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
24  print "It is OK to fetch /foo.html"
25 
26 OR supply the contents of robots.txt yourself:
27 
28  rerp = RobotExclusionRulesParser()
29  s = open("robots.txt").read()
30  rerp.parse(s)
31 
32  if rerp.is_allowed('CrunchyFrogBot', '/foo.html'):
33  print "It is OK to fetch /foo.html"
34 
35 The is_expired property tells you if you need to fetch a fresh copy of
36 this robots.txt.
37 
38  if rerp.is_expired:
39  # Get a new copy
40  pass
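
Crawl delays and sitemap URLs (GYM2008 extensions) are exposed as well. A
minimal illustrative sketch using the get_crawl_delay() method and the
sitemaps property defined below (the user agent name is just a placeholder):

 delay = rerp.get_crawl_delay('CrunchyFrogBot')
 if delay is not None:
     print "Wait %.1f seconds between requests" % delay

 for sitemap_url in rerp.sitemaps:
     print "Sitemap:", sitemap_url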
41 
42 
43 RobotExclusionRulesParser supports __unicode__() and __str__() so you can print
44 an instance to see its rules in robots.txt format.
45 
46 The comments refer to MK1994, MK1996 and GYM2008. These are:
47 MK1994 = the 1994 robots.txt draft spec (http://www.robotstxt.org/orig.html)
48 MK1996 = the 1996 robots.txt draft spec (http://www.robotstxt.org/norobots-rfc.txt)
49 GYM2008 = the Google-Yahoo-Microsoft extensions announced in 2008
50 (http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360)
51 
52 
53 This code is released under the following BSD license --
54 
55 Copyright (c) 2010, Philip Semanchuk
56 All rights reserved.
57 
58 Redistribution and use in source and binary forms, with or without
59 modification, are permitted provided that the following conditions are met:
60  * Redistributions of source code must retain the above copyright
61  notice, this list of conditions and the following disclaimer.
62  * Redistributions in binary form must reproduce the above copyright
63  notice, this list of conditions and the following disclaimer in the
64  documentation and/or other materials provided with the distribution.
65  * Neither the name of robotexclusionrulesparser nor the
66  names of its contributors may be used to endorse or promote products
67  derived from this software without specific prior written permission.
68 
69 THIS SOFTWARE IS PROVIDED BY ITS CONTRIBUTORS ''AS IS'' AND ANY
70 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
71 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
72 DISCLAIMED. IN NO EVENT SHALL Philip Semanchuk BE LIABLE FOR ANY
73 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
74 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
75 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
76 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
77 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
78 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79 """
80 
81 import sys
82 PY_MAJOR_VERSION = sys.version_info[0]
83 
84 from urlparse import urlparse as urllib_urlparse
85 from urlparse import urlunparse as urllib_urlunparse
86 from urllib import unquote as urllib_unquote
87 import urllib2 as urllib_request
88 import urllib2 as urllib_error
89 
90 import re
91 import time
92 import calendar
93 # rfc822 is deprecated since Python 2.3, but the functions I need from it
94 # are in email.utils, which isn't present until Python 2.5.
95 try:
96  import email.utils as email_utils
97 except ImportError:
98  import rfc822 as email_utils
99 
100 
101 # These are the different robots.txt syntaxes that this module understands.
102 # Hopefully this list will never have more than two elements.
103 MK1996 = 1
104 GYM2008 = 2
105 
106 _end_of_line_regex = re.compile(r"(?:\r\n)|\r|\n")
107 
108 # This regex is a little more generous than the spec because it accepts
109 # "User-agent" or "Useragent" (without a dash). MK1994/96 permits only the
110 # former. The regex also doesn't insist that "useragent" is at the exact
111 # beginning of the line, which makes this code immune to confusion caused
112 # by byte order markers.
113 _directive_regex = re.compile("(allow|disallow|user[-]?agent|sitemap|crawl-delay):[ \t]*(.*)", re.IGNORECASE)
114 
115 # This is the number of seconds in a week that I use to determine the default
116 # expiration date defined in MK1996.
117 SEVEN_DAYS = 60 * 60 * 24 * 7
118 
119 # This controls the max number of bytes read in as a robots.txt file. This
120 # is just a bit of defensive programming in case someone accidentally sends
121 # an ISO file in place of their robots.txt. (It happens...) Suggested by
122 # Dima Brodsky.
123 MAX_FILESIZE = 100 * 1024 # 100k
124 
125 # Control characters are everything < 0x20 and 0x7f.
126 _control_characters_regex = re.compile(r"""[\000-\037]|\0177""")
127 
128 # Charset extraction regex for pulling the encoding (charset) out of a
129 # content-type header.
130 _charset_extraction_regex = re.compile(r"""charset=['"]?(?P<encoding>[^'"]*)['"]?""")
131 
132 
133 def _raise_error(error, message):
134  # I have to exec() this code because the Python 2 syntax is invalid
135  # under Python 3 and vice-versa.
136  s = "raise "
137  s += "error, message" if (PY_MAJOR_VERSION == 2) else "error(message)"
138 
139  exec(s)
140 
141 
142 def _unquote_path(path):
143  # MK1996 says, 'If a %xx encoded octet is encountered it is unencoded
144  # prior to comparison, unless it is the "/" character, which has
145  # special meaning in a path.'
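 # Illustrative example (not from the spec text): "/a%2Fb%20c" comes back
 # as "/a%2Fb c" -- the %20 is decoded to a space while the %2F stays
 # encoded, because of the swap trick below.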
146  path = re.sub("%2[fF]", "\n", path)
147  path = urllib_unquote(path)
148  return path.replace("\n", "%2F")
149 
150 
151 def _scrub_data(s):
152  # Data is either a path or user agent name; i.e. the data portion of a
153  # robots.txt line. Scrubbing it consists of (a) removing extraneous
154  # whitespace, (b) turning tabs into spaces (path and UA names should not
155  # contain tabs), and (c) stripping control characters which, like tabs,
156  # shouldn't be present. (See MK1996 section 3.3 "Formal Syntax".)
157  s = _control_characters_regex.sub("", s)
158  s = s.replace("\t", " ")
159  return s.strip()
160 
161 
162 def _parse_content_type_header(header):
163  media_type = ""
164  encoding = ""
165 
166  # A typical content-type looks like this:
167  # text/plain; charset=UTF-8
168  # The portion after "text/plain" is optional and often not present.
169  # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7
170 
171  if header:
172  header = header.strip().lower()
173  else:
174  header = ""
175 
176  chunks = [s.strip() for s in header.split(";")]
177  media_type = chunks[0]
178  if len(chunks) > 1:
179  for parameter in chunks[1:]:
180  m = _charset_extraction_regex.search(parameter)
181  if m and m.group("encoding"):
182  encoding = m.group("encoding")
183 
184  return media_type.strip(), encoding.strip()
185 
186 
187 class _Ruleset(object):
188  """ _Ruleset represents a set of allow/disallow rules (and possibly a
189  crawl delay) that apply to a set of user agents.
190 
191  Users of this module don't need this class. It's available at the module
192  level only because RobotExclusionRulesParser() instances can't be
193  pickled if _Ruleset isn't visible at the module level.
194  """
195  ALLOW = 1
196  DISALLOW = 2
197 
198  def __init__(self):
199  self.robot_names = [ ]
200  self.rules = [ ]
201  self.crawl_delay = None
202 
203  def __str__(self):
204  s = self.__unicode__()
205  if PY_MAJOR_VERSION == 2:
206  s = s.encode("utf-8")
207 
208  return s
209 
210  def __unicode__(self):
211  d = { self.ALLOW : "Allow", self.DISALLOW : "Disallow" }
212 
213  s = ''.join( ["User-agent: %s\n" % name for name in self.robot_names] )
214 
215  if self.crawl_delay:
216  s += "Crawl-delay: %s\n" % self.crawl_delay
217 
218  s += ''.join( ["%s: %s\n" % (d[rule_type], path) for rule_type, path in self.rules] )
219 
220  return s
221 
222  def add_robot_name(self, bot):
223  self.robot_names.append(bot)
224 
225  def add_allow_rule(self, path):
226  self.rules.append((self.ALLOW, _unquote_path(path)))
227 
228  def add_disallow_rule(self, path):
229  self.rules.append((self.DISALLOW, _unquote_path(path)))
230 
231  def is_not_empty(self):
232  return bool(len(self.rules)) and bool(len(self.robot_names))
233 
234  def is_default(self):
235  return bool('*' in self.robot_names)
236 
237  def does_user_agent_match(self, user_agent):
238  match = False
239 
240  for robot_name in self.robot_names:
241  # MK1994 says, "A case insensitive substring match of the name
242  # without version information is recommended." MK1996 3.2.1
243  # states it even more strongly: "The robot must obey the first
244  # record in /robots.txt that contains a User-Agent line whose
245  # value contains the name token of the robot as a substring.
246  # The name comparisons are case-insensitive."
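 # Illustrative example (names are placeholders): a record declared as
 # "User-agent: Slurp" matches a UA string such as
 # "Mozilla/5.0 (compatible; Yahoo! Slurp)" because "slurp" is a
 # case-insensitive substring of it.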
247  match = match or (robot_name == '*') or (robot_name.lower() in user_agent.lower())
248 
249  return match
250 
251  def is_url_allowed(self, url, syntax=GYM2008):
252  allowed = True
253 
254  # Schemes and host names are not part of the robots.txt protocol,
255  # so I ignore them. It is the caller's responsibility to make
256  # sure they match.
257  _, _, path, parameters, query, fragment = urllib_urlparse(url)
258  url = urllib_urlunparse(("", "", path, parameters, query, fragment))
259 
260  url = _unquote_path(url)
261 
262  done = False
263  i = 0
264  while not done:
265  rule_type, path = self.rules[i]
266 
267  if (syntax == GYM2008) and ("*" in path or path.endswith("$")):
268  # GYM2008-specific syntax applies here
269  # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40360
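 # Illustrative example (assumed rule, not from the spec text): a rule
 # path of "/private*.html$" is split on "*", each piece is re.escape()d,
 # the pieces are rejoined with ".*" and the "$" anchor is re-appended,
 # producing a regex along the lines of "/private.*\.html$" that is then
 # matched against the unquoted URL below.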
270  if path.endswith("$"):
271  appendix = "$"
272  path = path[:-1]
273  else:
274  appendix = ""
275  parts = path.split("*")
276  pattern = "%s%s" % (".*".join([re.escape(p) for p in parts]), appendix)
277  if re.match(pattern, url):
278  # Ding!
279  done = True
280  allowed = (rule_type == self.ALLOW)
281  else:
282  # Wildcards are either not present or are taken literally.
283  if url.startswith(path):
284  # Ding!
285  done = True
286  allowed = (rule_type == self.ALLOW)
287  # A blank path means "nothing", so that effectively
288  # negates the value above.
289  # e.g. "Disallow: " means allow everything
290  if not path:
291  allowed = not allowed
292 
293  i += 1
294  if i == len(self.rules):
295  done = True
296 
297  return allowed
298 
299 
301  """A parser for robots.txt files."""
302  def __init__(self):
303  self._source_url = ""
304  self.user_agent = None
305  self.use_local_time = True
306  self.expiration_date = self._now() + SEVEN_DAYS
307  self._response_code = 0
308  self._sitemaps = [ ]
309  self.__rulesets = [ ]
310 
311 
312  @property
313  def source_url(self):
314  """The URL from which this robots.txt was fetched. Read only."""
315  return self._source_url
316 
317  @property
318  def response_code(self):
319  """The remote server's response code. Read only."""
320  return self._response_code
321 
322  @property
323  def sitemap(self):
324  """Deprecated; use 'sitemaps' instead. Returns the sitemap URL present
325  in the robots.txt, if any. Defaults to None. Read only."""
326  _raise_error(DeprecationWarning, "The sitemap property is deprecated. Use 'sitemaps' instead.")
327 
328  @property
329  def sitemaps(self):
330  """The sitemap URLs present in the robots.txt, if any. Defaults
331  to an empty list. Read only."""
332  # I return a copy of the list so the caller can manipulate the list
333  # without affecting self._sitemaps.
334  return self._sitemaps[:]
335 
336  @property
337  def is_expired(self):
338  """True if the difference between now and the last call to fetch()
339  exceeds the robots.txt expiration. Read only.
340  """
341  return self.expiration_date <= self._now()
342 
343 
344  def _now(self):
345  if self.use_local_time:
346  return time.time()
347  else:
348  # What the heck is timegm() doing in the calendar module?!?
349  return calendar.timegm(time.gmtime())
350 
351 
352  def is_allowed(self, user_agent, url, syntax=GYM2008):
353  """True if the user agent is permitted to visit the URL. The syntax
354  parameter can be GYM2008 (the default) or MK1996 for strict adherence
355  to the traditional standard.
356  """
357  # The robot rules are stored internally as Unicode. The two lines
358  # below ensure that the parameters passed to this function are
359  # also Unicode. If those lines were not present and the caller
360  # passed a non-Unicode user agent or URL string to this function,
361  # Python would silently convert it to Unicode before comparing it
362  # to the robot rules. Such conversions use the default encoding
363  # (usually US-ASCII) and if the string couldn't be converted using
364  # that encoding, Python would raise a UnicodeError later on in the
365  # guts of this code which would be confusing.
366  # Converting the strings to Unicode here doesn't make the problem
367  # go away but it does make the conversion explicit so that
368  # failures are easier to understand.
369  if not isinstance(user_agent, unicode):
370  user_agent = user_agent.decode()
371  if not isinstance(url, unicode):
372  url = url.decode()
373 
374  if syntax not in (MK1996, GYM2008):
375  _raise_error(ValueError, "Syntax must be MK1996 or GYM2008")
376 
377  for ruleset in self.__rulesets:
378  if ruleset.does_user_agent_match(user_agent):
379  return ruleset.is_url_allowed(url, syntax)
380 
381  return True
382 
383 
384  def get_crawl_delay(self, user_agent):
385  """Returns a float representing the crawl delay specified for this
386  user agent, or None if the crawl delay was unspecified or not a float.
387  """
388  # See is_allowed() comment about the explicit unicode conversion.
389  if (PY_MAJOR_VERSION < 3) and (not isinstance(user_agent, unicode)):
390  user_agent = user_agent.decode()
391 
392  for ruleset in self.__rulesets:
393  if ruleset.does_user_agent_match(user_agent):
394  return ruleset.crawl_delay
395 
396  return None
397 
398 
399  def fetch(self, url, timeout=None):
400  """Attempts to fetch the URL requested which should refer to a
401  robots.txt file, e.g. http://example.com/robots.txt.
402  """
403 
404  # ISO-8859-1 is the default encoding for text files per the specs for
405  # HTTP 1.0 (RFC 1945 sec 3.6.1) and HTTP 1.1 (RFC 2616 sec 3.7.1).
406  # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
407  encoding = "iso-8859-1"
408  content = ""
409  expires_header = None
410  content_type_header = None
411  self._response_code = 0
412  self._source_url = url
413 
414  if self.user_agent:
415  req = urllib_request.Request(url, None, { 'User-Agent' : self.user_agent })
416  else:
417  req = urllib_request.Request(url)
418 
419  try:
420  if timeout:
421  f = urllib_request.urlopen(req, timeout=timeout)
422  else:
423  f = urllib_request.urlopen(req)
424 
425  content = f.read(MAX_FILESIZE)
426  # As of Python 2.5, f.info() looks like it returns the HTTPMessage
427  # object created during the connection.
428  expires_header = f.info().get("expires")
429  content_type_header = f.info().get("Content-Type")
430  # As of Python 2.4, this file-like object reports the response
431  # code, too.
432  if hasattr(f, "code"):
433  self._response_code = f.code
434  else:
435  self._response_code = 200
436  f.close()
437  except urllib_error.URLError:
438  # This is a slightly convoluted way to get the error instance,
439  # but it works under Python 2 & 3.
440  error_instance = sys.exc_info()
441  if len(error_instance) > 1:
442  error_instance = error_instance[1]
443  if hasattr(error_instance, "code"):
444  self._response_code = error_instance.code
445 
446  # MK1996 section 3.4 says, "...robots should take note of Expires
447  # header set by the origin server. If no cache-control directives
448  # are present robots should default to an expiry of 7 days".
449 
450  # This code is lazy and looks at the Expires header but not
451  # Cache-Control directives.
452  self.expiration_date = None
453  if self._response_code >= 200 and self._response_code < 300:
454  # All's well.
455  if expires_header:
456  self.expiration_date = email_utils.parsedate_tz(expires_header)
457 
458  if self.expiration_date:
459  # About time zones -- the call to parsedate_tz() returns a
460  # 10-tuple with the time zone offset in the 10th element.
461  # There are 3 valid formats for HTTP dates, and one of
462  # them doesn't contain time zone information. (UTC is
463  # implied since all HTTP header dates are UTC.) When given
464  # a date that lacks time zone information, parsedate_tz()
465  # returns None in the 10th element. mktime_tz() interprets
466  # None in the 10th (time zone) element to mean that the
467  # date is *local* time, not UTC.
468  # Therefore, if the HTTP timestamp lacks time zone info
469  # and I run that timestamp through parsedate_tz() and pass
470  # it directly to mktime_tz(), I'll get back a local
471  # timestamp which isn't what I want. To fix this, I simply
472  # convert a time zone of None to zero. It's much more
473  # difficult to explain than to fix. =)
474  # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1
475  if self.expiration_date[9] == None:
476  self.expiration_date = self.expiration_date[:9] + (0,)
477 
478  self.expiration_date = email_utils.mktime_tz(self.expiration_date)
479  if self.use_local_time:
480  # I have to do a little more converting to get this
481  # UTC timestamp into localtime.
482  self.expiration_date = time.mktime(time.gmtime(self.expiration_date))
483  #else:
484  # The expires header was garbage.
485 
486  if not self.expiration_date: self.expiration_date = self._now() + SEVEN_DAYS
487 
488  if (self._response_code >= 200) and (self._response_code < 300):
489  # All's well.
490  media_type, encoding = _parse_content_type_header(content_type_header)
491  # RFC 2616 sec 3.7.1 --
492  # When no explicit charset parameter is provided by the sender,
493  # media subtypes of the "text" type are defined to have a default
494  # charset value of "ISO-8859-1" when received via HTTP.
495  # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
496  if not encoding:
497  encoding = "iso-8859-1"
498  elif self._response_code in (401, 403):
499  # 401 or 403 ==> Go away or I will taunt you a second time!
500  # (according to MK1996)
501  content = "User-agent: *\nDisallow: /\n"
502  elif self._response_code == 404:
503  # No robots.txt ==> everyone's welcome
504  content = ""
505  else:
506  # Uh-oh. I punt this up to the caller.
507  _raise_error(urllib_error.URLError, self._response_code)
508 
509  if((PY_MAJOR_VERSION == 2) and isinstance(content, str)) or \
510  ((PY_MAJOR_VERSION > 2) and (not isinstance(content, str))):
511  # This ain't Unicode yet! It needs to be.
512 
513  # Unicode decoding errors are another point of failure that I punt
514  # up to the caller.
515  try:
516  content = content.decode(encoding)
517  except UnicodeError:
518  _raise_error(UnicodeError,
519  "Robots.txt contents are not in the encoding expected (%s)." % encoding)
520  except (LookupError, ValueError):
521  # LookupError ==> Python doesn't have a decoder for that encoding.
522  # One can also get a ValueError here if the encoding starts with
523  # a dot (ASCII 0x2e). See Python bug 1446043 for details. This
524  # bug was supposedly fixed in Python 2.5.
525  _raise_error(UnicodeError, "I don't understand the encoding \"%s\"." % encoding)
526 
527  # Now that I've fetched the content and turned it into Unicode, I
528  # can parse it.
529  self.parse(content)
530 
531 
532  def parse(self, s):
533  """Parses the passed string as a set of robots.txt rules."""
534  self._sitemaps = []
535  self.__rulesets = []
536 
537  if(PY_MAJOR_VERSION > 2) and (isinstance(s, bytes) or isinstance(s, bytearray)) or \
538  (PY_MAJOR_VERSION == 2) and (not isinstance(s, unicode)):
539  s = s.decode("iso-8859-1")
540 
541  # Normalize newlines.
542  s = _end_of_line_regex.sub("\n", s)
543 
544  lines = s.split("\n")
545 
546  previous_line_was_a_user_agent = False
547  current_ruleset = None
548 
549  for line in lines:
550  line = line.strip()
551 
552  if line and line[0] == '#':
553  # "Lines containing only a comment are discarded completely,
554  # and therefore do not indicate a record boundary." (MK1994)
555  pass
556  else:
557  # Remove comments
558  i = line.find("#")
559  if i != -1: line = line[:i]
560 
561  line = line.strip()
562 
563  if not line:
564  # An empty line indicates the end of a ruleset.
565  if current_ruleset and current_ruleset.is_not_empty():
566  self.__rulesets.append(current_ruleset)
567 
568  current_ruleset = None
569  previous_line_was_a_user_agent = False
570  else:
571  # Each non-empty line falls into one of six categories:
572  # 1) User-agent: blah blah blah
573  # 2) Disallow: blah blah blah
574  # 3) Allow: blah blah blah
575  # 4) Crawl-delay: blah blah blah
576  # 5) Sitemap: blah blah blah
577  # 6) Everything else
578  # 1 - 5 are interesting and I find them with the regex
579  # below. Category 6 I discard as directed by the MK1994
580  # ("Unrecognised headers are ignored.")
581  # Note that 4 & 5 are specific to GYM2008 syntax, but
582  # respecting them here is not a problem. They're just
583  # additional information that the caller is free to ignore.
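 # Illustrative example: for the line "Disallow: /cgi-bin/" the regex
 # below returns [("Disallow", "/cgi-bin/")]; for an unrecognised
 # directive it returns an empty list.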
584  matches = _directive_regex.findall(line)
585 
586  # Categories 1 - 5 produce two matches, #6 produces none.
587  if matches:
588  field, data = matches[0]
589  field = field.lower()
590  data = _scrub_data(data)
591 
592  # Matching "useragent" is a deviation from the
593  # MK1994/96 which permits only "user-agent".
594  if field in ("useragent", "user-agent"):
595  if previous_line_was_a_user_agent:
596  # Add this UA to the current ruleset
597  if current_ruleset and data:
598  current_ruleset.add_robot_name(data)
599  else:
600  # Save the current ruleset and start a new one.
601  if current_ruleset and current_ruleset.is_not_empty():
602  self.__rulesets.append(current_ruleset)
603  #else:
604  # (is_not_empty() == False) ==> malformed
605  # robots.txt listed a UA line but provided
606  # no name or didn't provide any rules
607  # for a named UA.
608  current_ruleset = _Ruleset()
609  if data:
610  current_ruleset.add_robot_name(data)
611 
612  previous_line_was_a_user_agent = True
613  elif field == "allow":
614  previous_line_was_a_user_agent = False
615  if current_ruleset:
616  current_ruleset.add_allow_rule(data)
617  elif field == "sitemap":
618  previous_line_was_a_user_agent = False
619  self._sitemaps.append(data)
620  elif field == "crawl-delay":
621  # Only Yahoo documents the syntax for Crawl-delay.
622  # ref: http://help.yahoo.com/l/us/yahoo/search/webcrawler/slurp-03.html
623  previous_line_was_a_user_agent = False
624  if current_ruleset:
625  try:
626  current_ruleset.crawl_delay = float(data)
627  except ValueError:
628  # Invalid crawl-delay -- ignore.
629  pass
630  else:
631  # This is a disallow line
632  previous_line_was_a_user_agent = False
633  if current_ruleset:
634  current_ruleset.add_disallow_rule(data)
635 
636  if current_ruleset and current_ruleset.is_not_empty():
637  self.__rulesets.append(current_ruleset)
638 
639  # Now that I have all the rulesets, I want to order them in a way
640  # that makes comparisons easier later. Specifically, any ruleset that
641  # contains the default user agent '*' should go at the end of the list
642  # so that I only apply the default as a last resort. According to
643  # MK1994/96, there should only be one ruleset that specifies * as the
644  # user-agent, but you know how these things go.
645  not_defaults = [r for r in self.__rulesets if not r.is_default()]
646  defaults = [r for r in self.__rulesets if r.is_default()]
647 
648  self.__rulesets = not_defaults + defaults
649 
650 
651  def __str__(self):
652  s = self.__unicode__()
653  if PY_MAJOR_VERSION == 2:
654  s = s.encode("utf-8")
655 
656  return s
657 
658  def __unicode__(self):
659  if self._sitemaps:
660  s = "Sitemaps: %s\n\n" % self._sitemaps
661  else:
662  s = ""
663  if PY_MAJOR_VERSION < 3:
664  s = unicode(s)
665  # I also need to string-ify each ruleset. The function for doing so
666  # varies under Python 2/3.
667  stringify = (unicode if (PY_MAJOR_VERSION == 2) else str)
668  return s + '\n'.join( [stringify(ruleset) for ruleset in self.__rulesets] )
669 
670 
672  """A drop-in replacement for the Python standard library's RobotFileParser
673  that retains all of the features of RobotExclusionRulesParser.
674  """
675  def __init__(self, url = ""):
676  RobotExclusionRulesParser.__init__(self)
677 
678  self._user_provided_url = ""
679  self.last_checked = None
680 
681  self.set_url(url)
682 
683 
684  def set_url(self, url):
685  # I don't want to stuff this into self._source_url because
686  # _source_url is set only as a side effect of calling fetch().
687  self._user_provided_url = url
688 
689 
690  def read(self):
691  RobotExclusionRulesParser.fetch(self, self._user_provided_url)
692 
693 
694  def parse(self, lines):
695  RobotExclusionRulesParser.parse(self, ''.join(lines))
696 
697 
698  def can_fetch(self, user_agent, url, syntax=GYM2008):
699  return RobotExclusionRulesParser.is_allowed(self, user_agent, url, syntax)
700 
701 
702  def mtime(self):
703  return self.last_checked
704 
705 
706  def modified(self):
707  self.last_checked = time.time()
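
# A minimal usage sketch for RobotFileParserLookalike, mirroring the standard
# library robotparser interface (illustrative only; the URL and user agent
# names are placeholders):
#
#   lookalike = RobotFileParserLookalike()
#   lookalike.set_url("http://www.example.com/robots.txt")
#   lookalike.read()
#   if lookalike.can_fetch("CrunchyFrogBot", "/foo.html"):
#       print "It is OK to fetch /foo.html"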