HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.RobotsParser.RobotsParser Class Reference

Public Member Functions

def __init__ (self, headers=None, isCacheUsing=False, robotsFileDir=None)
 
def initFiends (self, headers=None, isCacheUsing=False, robotsFileDir=None)
 
def loadRobots (self, url, siteId=None, additionHeaders=None, proxyName=None)
 
def checkUrlByRobots (self, url, siteId=None, headers=None)
 

Public Attributes

 localParser
 
 headers
 
 robotsFileDir
 
 cacheElement
 
 cacheElementKeys
 
 localCrawlerDataStorage
 

Static Public Attributes

 ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
 
string USER_AGENT_HEADER_NAME = "User-Agent"
 
int HTTP_OK_CODE = 200
 

Detailed Description

Definition at line 21 of file RobotsParser.py.
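RobotsParser loads the robots.txt file of a site, parses it with OwnRobots.RobotExclusionRulesParser, and answers whether a given URL is allowed for a given User-Agent; parsed files can optionally be cached on the local file system.

A minimal usage sketch, assuming dc_crawler is importable; the URL and User-Agent values are illustrative, and loadRobots performs a real HTTP request when nothing is cached:

from dc_crawler.RobotsParser import RobotsParser

parser = RobotsParser(headers=["MyBot/1.0", "*"])  # user agents to check, in order
if parser.loadRobots("http://example.com/page.html"):  # fetch and parse robots.txt
    isAllowed, userAgent = parser.checkUrlByRobots("http://example.com/page.html")
    print(isAllowed, userAgent)  # e.g. (True, "MyBot/1.0")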

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.RobotsParser.RobotsParser.__init__(self, headers=None, isCacheUsing=False, robotsFileDir=None)

Definition at line 30 of file RobotsParser.py.

def __init__(self, headers=None, isCacheUsing=False, robotsFileDir=None):
    self.localParser = None
    self.headers = ["*"]
    self.robotsFileDir = None
    self.cacheElement = None
    self.cacheElementKeys = None
    self.localCrawlerDataStorage = None
    self.initFiends(headers, isCacheUsing, robotsFileDir)

Member Function Documentation

◆ checkUrlByRobots()

def dc_crawler.RobotsParser.RobotsParser.checkUrlByRobots(self, url, siteId=None, headers=None)

Definition at line 105 of file RobotsParser.py.

def checkUrlByRobots(self, url, siteId=None, headers=None):
    isAllowed = True
    retUserAgent = None
    if self.localParser is not None:
        # Check the caller's User-Agent first, then fall back to the wildcard agent "*".
        if headers is not None and self.USER_AGENT_HEADER_NAME in headers and \
           headers[self.USER_AGENT_HEADER_NAME] is not None:
            self.headers = [headers[self.USER_AGENT_HEADER_NAME]]
            self.headers.append("*")

        for userAgent in self.headers:
            retUserAgent = userAgent
            isAllowed = self.localParser.is_allowed(userAgent, url)
            if not isAllowed:
                break
        # Report the caller's User-Agent for a disallowed URL.
        if not isAllowed and headers is not None and self.USER_AGENT_HEADER_NAME in headers:
            retUserAgent = headers[self.USER_AGENT_HEADER_NAME]

    # Account the robots.txt cache access and persist the updated element.
    if self.localCrawlerDataStorage is not None and siteId is not None and \
       self.cacheElement is not None and self.cacheElementKeys is not None:
        host = Utils.UrlParser.getDomain(url)
        if host is not None:
            self.cacheElement[self.cacheElementKeys[0]][self.cacheElementKeys[1]] += 1
            self.localCrawlerDataStorage.saveElement(self.robotsFileDir, host, siteId, self.cacheElement)
    return isAllowed, retUserAgent
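A caller may pass the HTTP request headers so that the request's own User-Agent is checked before the wildcard agent; the values below are illustrative:

headers = {"User-Agent": "MyBot/1.0"}
isAllowed, userAgent = parser.checkUrlByRobots("http://example.com/private/page.html", headers=headers)
if not isAllowed:
    print("disallowed for", userAgent)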

◆ initFiends()

def dc_crawler.RobotsParser.RobotsParser.initFiends(self, headers=None, isCacheUsing=False, robotsFileDir=None)

Definition at line 45 of file RobotsParser.py.

def initFiends(self, headers=None, isCacheUsing=False, robotsFileDir=None):
    if headers is None:
        self.headers = ["*"]
    else:
        self.headers = headers
    self.robotsFileDir = robotsFileDir
    if isCacheUsing:
        self.localCrawlerDataStorage = LFSDataStorage()
    else:
        self.localCrawlerDataStorage = None
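Constructing the parser with isCacheUsing=True plugs in an LFSDataStorage instance, so loadRobots can reuse robots.txt content cached on disk; the cache directory and siteId below are illustrative:

parser = RobotsParser(isCacheUsing=True, robotsFileDir="/tmp/robots-cache")
parser.loadRobots("http://example.com/", siteId="42")  # siteId enables the cache lookup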

◆ loadRobots()

def dc_crawler.RobotsParser.RobotsParser.loadRobots(self, url, siteId=None, additionHeaders=None, proxyName=None)

Definition at line 64 of file RobotsParser.py.

def loadRobots(self, url, siteId=None, additionHeaders=None, proxyName=None):
    if additionHeaders is None:
        additionHeaders = {}
    contentBuf = None
    # Try the local file-system cache first.
    if self.localCrawlerDataStorage is not None and self.robotsFileDir is not None and siteId is not None:
        host = Utils.UrlParser.getDomain(url)
        if host is not None:
            self.cacheElement = self.localCrawlerDataStorage.loadElement(self.robotsFileDir, host, siteId)
            if self.cacheElement is not None:
                self.cacheElementKeys = None
                cek = self.localCrawlerDataStorage.fetchLowFreqHeaders(fileStorageElements=self.cacheElement,
                                                                       fileCacheOnly=True)
                if len(cek) > 0 and cek[0][0] == "robots.txt":
                    self.cacheElementKeys = cek[0]
                    contentBuf = self.cacheElementKeys[1]
    # On a cache miss, fetch robots.txt over HTTP, optionally through a proxy.
    if contentBuf is None:
        robotsUrl = self.ROBOTS_PATTERN.sub(r'\1/robots.txt', url)
        logger.info(">>> robotsUrl: " + robotsUrl)
        try:
            if proxyName is not None and proxyName:
                response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=True,
                                        proxies={"http": "http://" + proxyName})
            else:
                response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=True)
            if response is not None and response.status_code == self.HTTP_OK_CODE:
                contentBuf = response.content
            else:
                logger.info(">>> robots.txt loading error, response is None or status_code not 200")
        except Exception as excp:
            logger.info(">>> robots.txt loading error = " + str(excp))

    if contentBuf is not None:
        self.localParser = OwnRobots.RobotExclusionRulesParser()
        self.localParser.parse(contentBuf)
    return self.localParser is not None
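A sketch of a fetch through an HTTP proxy; the proxy host, siteId, and headers are illustrative:

loaded = parser.loadRobots("http://example.com/a/b.html",
                           siteId="42",
                           additionHeaders={"User-Agent": "MyBot/1.0"},
                           proxyName="proxy.local:3128")  # becomes proxies={"http": "http://proxy.local:3128"}
if loaded:
    isAllowed, userAgent = parser.checkUrlByRobots("http://example.com/a/b.html")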

Member Data Documentation

◆ cacheElement

dc_crawler.RobotsParser.RobotsParser.cacheElement

Definition at line 34 of file RobotsParser.py.

◆ cacheElementKeys

dc_crawler.RobotsParser.RobotsParser.cacheElementKeys

Definition at line 35 of file RobotsParser.py.

◆ headers

dc_crawler.RobotsParser.RobotsParser.headers

Definition at line 32 of file RobotsParser.py.

◆ HTTP_OK_CODE

int dc_crawler.RobotsParser.RobotsParser.HTTP_OK_CODE = 200
static

Definition at line 25 of file RobotsParser.py.

◆ localCrawlerDataStorage

dc_crawler.RobotsParser.RobotsParser.localCrawlerDataStorage

Definition at line 36 of file RobotsParser.py.

◆ localParser

dc_crawler.RobotsParser.RobotsParser.localParser

Definition at line 31 of file RobotsParser.py.

◆ ROBOTS_PATTERN

dc_crawler.RobotsParser.RobotsParser.ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
static

Definition at line 23 of file RobotsParser.py.
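The pattern keeps only the scheme and host of a URL; loadRobots uses it to build the robots.txt location:

import re
ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
print(ROBOTS_PATTERN.sub(r'\1/robots.txt', "http://example.com/a/b?q=1"))
# prints: http://example.com/robots.txt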

◆ robotsFileDir

dc_crawler.RobotsParser.RobotsParser.robotsFileDir

Definition at line 33 of file RobotsParser.py.

◆ USER_AGENT_HEADER_NAME

string dc_crawler.RobotsParser.RobotsParser.USER_AGENT_HEADER_NAME = "User-Agent"
static

Definition at line 24 of file RobotsParser.py.


The documentation for this class was generated from the following file:
RobotsParser.py