HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
RobotsParser.py
1 """
2 @package: dc
3 @file RobotsParser.py
4 @author Scorp <developers.hce@gmail.com>
5 @link: http://hierarchical-cluster-engine.com/
6 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
7 @license: http://hierarchical-cluster-engine.com/license/
8 @since: 0.1
9 """
10 
11 import re
12 import requests
13 import dc_crawler.OwnRobots as OwnRobots
14 import app.Utils as Utils # pylint: disable=F0401
15 from app.LFSDataStorage import LFSDataStorage
16 
17 logger = Utils.MPLogger().getLogger()
18 
# #The RobotsParser class parses a robots.txt file and checks the current URL against its rules.
#
class RobotsParser(object):

  ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
  USER_AGENT_HEADER_NAME = "User-Agent"
  HTTP_OK_CODE = 200

  # #RobotsParser class constructor
  #
  # @param headers sets the default value for the User-Agent HTTP header
  # @param isCacheUsing bool value, use the internal cache or not
  # @param robotsFileDir path to the cache file storage
  def __init__(self, headers=None, isCacheUsing=False, robotsFileDir=None):
    self.localParser = None
    self.headers = ["*"]
    self.robotsFileDir = None
    self.cacheElement = None
    self.cacheElementKeys = None
    self.localCrawlerDataStorage = None
    self.initFiends(headers, isCacheUsing, robotsFileDir)


  # #initFiends performs class field initialization (non-default)
  #
  # @param headers - external headers initialization
  # @param isCacheUsing - bool value, use the internal cache or not
  # @param robotsFileDir - path to the cache file storage
  def initFiends(self, headers=None, isCacheUsing=False, robotsFileDir=None):
    if headers is None:
      self.headers = ["*"]
    else:
      self.headers = headers
    self.robotsFileDir = robotsFileDir
    if isCacheUsing:
      self.localCrawlerDataStorage = LFSDataStorage()
    else:
      self.localCrawlerDataStorage = None


  # #loadRobots method loads the robots.txt file for the current domain (set in the url param)
  #
  # @param url - URL that contains the domain for robots.txt fetching
  # @param siteId - Site ID
  # @param additionHeaders - additional HTTP headers
  # @param proxyName - proxy host and port as string (sample: '127.0.0.1:80')
  # @return bool value - whether robots.txt was successfully opened or not
  def loadRobots(self, url, siteId=None, additionHeaders=None, proxyName=None):
    if additionHeaders is None:
      additionHeaders = {}
    contentBuf = None
    if self.localCrawlerDataStorage is not None and self.robotsFileDir is not None and siteId is not None:
      host = Utils.UrlParser.getDomain(url)
      if host is not None:
        self.cacheElement = self.localCrawlerDataStorage.loadElement(self.robotsFileDir, host, siteId)
        if self.cacheElement is not None:
          self.cacheElementKeys = None
          cek = self.localCrawlerDataStorage.fetchLowFreqHeaders(fileStorageElements=self.cacheElement,
                                                                 fileCacheOnly=True)
          if len(cek) > 0 and cek[0][0] == "robots.txt":
            self.cacheElementKeys = cek[0]
            contentBuf = self.cacheElementKeys[1]
    if contentBuf is None:
      robotsUrl = self.ROBOTS_PATTERN.sub(r'\1/robots.txt', url)
      logger.info(">>> robotsUrl: " + robotsUrl)
      try:
        if proxyName is not None and proxyName:
          response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=True,
                                  proxies={"http": "http://" + proxyName})
        else:
          response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=True)
        if response is not None and response.status_code == self.HTTP_OK_CODE:
          contentBuf = response.content
        else:
          logger.info(">>> robots.txt loading error, response is None or status_code not 200")
      except Exception as excp:
        logger.info(">>> robots.txt loading error = " + str(excp))

    if contentBuf is not None:
      self.localParser = OwnRobots.RobotExclusionRulesParser()
      self.localParser.parse(contentBuf)
    return self.localParser is not None


  # #checkUrlByRobots checks the incoming URL against the preloaded robots.txt rules
  #
  # @param url - URL to check
  # @param siteId - Site ID
  # @param headers - HTTP headers; the User-Agent value is taken from them if present
  # @return tuple (isAllowed, userAgent)
  def checkUrlByRobots(self, url, siteId=None, headers=None):
    isAllowed = True
    retUserAgent = None
    if self.localParser is not None:
      if headers is not None and self.USER_AGENT_HEADER_NAME in headers and \
          headers[self.USER_AGENT_HEADER_NAME] is not None:
        self.headers = [headers[self.USER_AGENT_HEADER_NAME]]
        self.headers.append("*")

      for userAgent in self.headers:
        retUserAgent = userAgent
        isAllowed = self.localParser.is_allowed(userAgent, url)
        if not isAllowed:
          break
      if not isAllowed and headers is not None and self.USER_AGENT_HEADER_NAME in headers:
        retUserAgent = headers[self.USER_AGENT_HEADER_NAME]

    if self.localCrawlerDataStorage is not None and siteId is not None and \
        self.cacheElement is not None and self.cacheElementKeys is not None:
      host = Utils.UrlParser.getDomain(url)
      if host is not None:
        self.cacheElement[self.cacheElementKeys[0]][self.cacheElementKeys[1]] += 1
        self.localCrawlerDataStorage.saveElement(self.robotsFileDir, host, siteId, self.cacheElement)
    return isAllowed, retUserAgent
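
A minimal usage sketch, assuming the dc_crawler and app packages are importable; the import path, URL and User-Agent value below are illustrative placeholders and not part of the file above.

# Hypothetical usage of RobotsParser; module path and values are placeholders.
from dc_crawler.RobotsParser import RobotsParser

robotsParser = RobotsParser(headers=["MyCrawlerBot"], isCacheUsing=False)
# loadRobots() derives http://example.com/robots.txt from the page URL via ROBOTS_PATTERN,
# fetches and parses it, and returns True when a parser instance is ready.
if robotsParser.loadRobots("http://example.com/some/page.html"):
  # checkUrlByRobots() tries the User-Agent from the headers first, then the "*" wildcard,
  # and returns the tuple (isAllowed, userAgent).
  isAllowed, userAgent = robotsParser.checkUrlByRobots("http://example.com/some/page.html",
                                                       headers={"User-Agent": "MyCrawlerBot"})
  print("allowed=%s, agent=%s" % (isAllowed, userAgent))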