# @author Scorp <developers.hce@gmail.com>
# @link: http://hierarchical-cluster-engine.com/
# @copyright: Copyright © 2013-2014 IOIX Ukraine
# @license: http://hierarchical-cluster-engine.com/license/

# Case-insensitive pattern matching an absolute http(s) URL.
# group(1) captures the "http(s)://host" prefix (scheme + authority),
# which loadRobots() uses to build the site-root robots.txt URL.
# NOTE(review): ".*" after the group only consumes the rest of the URL;
# the match is anchored at the start by re.match-style use — confirm callers
# use .match/.search consistently.
ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
# Name of the HTTP request header carrying the user-agent string when
# fetching robots.txt (see loadRobots, which passes additionHeaders to
# requests.get).
USER_AGENT_HEADER_NAME = "User-Agent"

def __init__(self, headers=None, isCacheUsing=False, robotsFileDir=None):
    # Constructor of the robots-checker: delegates all member
    # initialization to initFiends() ("sic" — the misspelled name is the
    # real project identifier; presumably intended as "initFields", kept
    # for compatibility with existing callers).
    # @param headers       - optional default HTTP headers for robots.txt requests
    # @param isCacheUsing  - enable caching of fetched robots.txt content
    # @param robotsFileDir - directory for the robots.txt cache files
    # NOTE(review): original source lines 31-36 are not visible in this
    # extract (likely a doxygen comment block) — confirm against the full file.
    self.initFiends(headers, isCacheUsing, robotsFileDir)
45 def initFiends(self, headers=None, isCacheUsing=False, robotsFileDir=None):
64 def loadRobots(self, url, siteId=None, additionHeaders=None, proxyName=None):
65 if additionHeaders
is None:
69 host = Utils.UrlParser.getDomain(url)
76 if len(cek) > 0
and cek[0][0] ==
"robots.txt":
79 if contentBuf
is None:
81 logger.info(
">>> robotsUrl: " + robotsUrl)
83 if proxyName
is not None and proxyName:
84 response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=
True,
85 proxies={
"http":
"http://" + proxyName})
87 response = requests.get(robotsUrl, headers=additionHeaders, allow_redirects=
True)
88 if response
is not None and response.status_code == self.
HTTP_OK_CODE:
89 contentBuf = response.content
91 logger.info(
">>> robots.txt loading error, response is None or status_code not 200")
92 except Exception
as excp:
93 logger.info(
">>> robots.txt loading error = " + str(excp))
95 if contentBuf
is not None:
115 retUserAgent = userAgent
116 isAllowed = self.
localParser.is_allowed(userAgent, url)
124 host = Utils.UrlParser.getDomain(url)
128 return isAllowed, retUserAgent
def loadRobots(self, url, siteId=None, additionHeaders=None, proxyName=None)
def __init__(self, headers=None, isCacheUsing=False, robotsFileDir=None)
string USER_AGENT_HEADER_NAME
def checkUrlByRobots(self, url, siteId=None, headers=None)
def initFiends(self, headers=None, isCacheUsing=False, robotsFileDir=None)