727 def crawl(self, incomingContent):
# NOTE(review): this is a garbled/extracted dump of a Python 2 method — the
# original file's line numbers (727, 728, ...) are fused into the text, logical
# lines are split mid-statement, and gaps in the embedded numbering show that
# many original lines (several `try:` headers, loop bodies, initializations)
# are missing from this view. The code is kept byte-identical below; only
# these review comments were added. Do not treat this listing as runnable.
#
# Purpose (as far as the visible code shows): crawl one URL for the current
# batch item — apply the per-URL request delay, resolve the real URL, apply
# an optional FETCHER_MACRO, HTTP basic auth and URL templates, check host
# availability and robots.txt, fetch with proxy rotation/retries, follow HTML
# redirects, build the crawled resource, run STAGE_BEFORE_DOM_PRE filters,
# and map every failure type to an error-mask bit via the except-chain at the
# end. Result is stored in self.crawledResource.
#
# Reset the crawl result before starting (original lines 728/730 split here).
728 self.crawledResource =
None 730 self.logger.debug(
"Make request delay " + str(self.url.requestDelay / 1000.0) +
" sec.")
# requestDelay appears to be in milliseconds — divided by 1000.0 for sleep().
731 time.sleep(self.url.requestDelay / 1000.0)
734 self.logger.debug(
"!!! self.url.url = '%s'", str(self.url.url))
# Hand the URL/site to the helper object and resolve the "real" URL from it.
736 self.urlProcess.url = self.url.url
737 self.urlProcess.site = self.site
739 url = self.urlProcess.getRealUrl()
# startTime is later passed to resourceProcess.generateResource (line 934).
741 startTime = time.time()
743 self.processRotatedHeaders(url)
745 self.logger.debug(
"!!! url = '%s'", str(url))
# Optional FETCHER_MACRO site property: parsed from JSON when present and
# non-empty. NOTE(review): the matching `try:` (original line ~750) is not
# visible in this dump; only the json.loads and its `except` survive.
748 if 'FETCHER_MACRO' in self.siteProperties
and self.siteProperties[
'FETCHER_MACRO']
is not None\
749 and self.siteProperties[
'FETCHER_MACRO'] !=
'':
751 macro = json.loads(self.siteProperties[
'FETCHER_MACRO'])
# Python 2 `except <type>, err:` syntax. On macro deserialization failure the
# error-mask bit is set, site params updated and the URL marked as failed.
752 except Exception, err:
753 self.logger.
error(
"Initialization of macro error: %s, source: %s", str(err),
754 str(self.siteProperties[
'FETCHER_MACRO']))
755 self.errorMask = self.errorMask | APP_CONSTS.ERROR_MACRO_DESERIALIZATION
756 self.updateSiteParams(APP_CONSTS.ERROR_MACRO_DESERIALIZATION)
757 self.updateURLForFailed(self.errorMask)
# HTTP basic auth pair built only when both name and password are truthy.
# NOTE(review): auth is logged including the password — potential credential
# leak into logs; confirm whether that is intended.
763 if self.authName
and self.authPwd:
764 auth = (self.authName, self.authPwd)
765 self.logger.info(
"using http basic auth %s:%s", self.authName, self.authPwd)
# Resolve POST data / headers for the request from configured post forms.
771 self.urlProcess.urlObj = self.url
772 postData = self.urlProcess.resolveHTTP(self.postForms, self.httpApplyHeaders)
# Apply regular/realtime URL templates (note: "Tempalte" is the project's own
# spelling of these attributes — kept as-is).
776 url = self.urlProcess.urlTemplateApply(url, self.batch.crawlerType, self.urlTempalteRegular,
777 self.urlTempalteRealtime, self.urlTempalteRegularEncode,
778 self.urlTempalteRealtimeEncode)
780 self.logger.debug(
"!!! urlTemplateApply() return url = '%s'", str(url))
# Host availability gate — raises UrlAvailableException, handled by the
# except-chain near the end of this method (original line 1124).
783 if not CrawlerTask.isAvailableUrl(siteProperties=self.siteProperties, url=url, logger=self.logger):
784 self.logger.debug(
"Host '%s' is not available!", str(url))
785 raise UrlAvailableException(
"Host '%s' is not available!" % str(url))
# Robots.txt obey mode: on when ROBOTS_MODE is absent or > 0.
788 if "ROBOTS_MODE" not in self.siteProperties
or int(self.siteProperties[
"ROBOTS_MODE"]) > 0:
789 self.logger.debug(
"Robots.txt obey mode ON")
# NOTE(review): getProxyName call is truncated here — arguments from original
# lines 793–794/796–797 are missing from this dump.
792 proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
795 dbWrapper=self.dbWrapper,
798 if self.robotsParser.loadRobots(url, self.batchItem.siteId, self.httpApplyHeaders, proxyName):
799 self.httpApplyHeaders = self.updateHeadersByCookies(self.httpApplyHeaders,
801 HTTPCookieResolver.STAGE_ROBOTS)
803 isAllowed, retUserAgent = self.robotsParser.checkUrlByRobots(url, self.batchItem.siteId,
804 self.httpApplyHeaders)
# NOTE(review): the `if not isAllowed:` guard (original line ~805) is missing;
# the block below presumably runs only when robots.txt disallows the URL.
806 self.logger.debug(
">>> URL " + url +
" is NOT Allowed by user-agent:" + str(retUserAgent))
807 self.errorMask = self.errorMask | APP_CONSTS.ERROR_ROBOTS_NOT_ALLOW
808 self.updateSiteParams(APP_CONSTS.ERROR_ROBOTS_NOT_ALLOW)
809 self.updateURLForFailed(self.errorMask)
# Default (empty) response object so `res` is defined before the fetch loop.
815 res = self.makeDefaultResponse(Response())
# NOTE(review): localUrl's initial assignment (original lines 816/818) is
# missing from this dump; presumably localUrl starts as the resolved url.
817 self.logger.debug(
"!!! localUrl = '%s'", str(localUrl))
819 retriesCount = HTTPProxyResolver.getTriesCount(self.siteProperties)
# Fetch/retry loop: up to retriesCount + 1 attempts with proxy rotation.
822 for count
in range(0, retriesCount + 1):
823 self.logger.debug(
"retriesCount = %s, count = %s", str(retriesCount), str(count))
# NOTE(review): proxyTriesCount's initialization/increment is not visible.
825 HTTPProxyResolver.checkTriesCount(siteProperties=self.siteProperties, currentTriesCount=proxyTriesCount)
# NOTE(review): truncated call — arguments from original lines 829–830/832–834
# are missing.
828 proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
831 dbWrapper=self.dbWrapper,
# NOTE(review): the body of this `if` (original lines 836–838) is missing.
835 if proxyName
is not None:
839 self.logger.debug(
"Use headers: %s type: %s", str(self.httpApplyHeaders), str(
type(self.httpApplyHeaders)))
840 self.logger.info(
"start to fetch: %s", localUrl)
# The actual fetch. NOTE(review): trailing arguments (original line 842) and
# the enclosing `try:` header are missing from this dump.
841 res = self.httpRequestWrapper(localUrl, self.httpApplyHeaders, auth, postData, self.url, incomingContent, \
# Selenium fetch failure inside the retry loop: record a proxy fault so the
# faulty proxy can be rotated out. (Truncated: lines 846–847 missing.)
843 except SeleniumFetcherException, err:
844 self.logger.debug(
"!!! httpRequestWrapper return error: %s", str(err))
845 CrawlerTask.addProxyFaults(siteProperties=self.siteProperties,
848 dbWrapper=self.dbWrapper)
# Proxy rotation check based on the fetched content. NOTE(review): the body
# presumably ends with a `continue` (original lines 858–859 missing).
852 if CrawlerTask.isNeedRotateProxy(siteProperties=self.siteProperties,
855 dbWrapper=self.dbWrapper,
856 rawContent=res.rendered_unicode_content):
857 self.logger.debug(
'Necessary rotate proxy. Go to the next...')
# Post-fetch decision chain: error mask set -> log; no/limited/disabled HTML
# redirects -> stop; redirect limit reached -> fail; otherwise try to extract
# an HTML redirect target from the content. Branch bodies (866, 871, ...) are
# partially missing from this dump.
860 if res
is not None and res.error_mask != APP_CONSTS.ERROR_OK:
861 self.logger.debug(
"res.error_mask = %s", str(res.error_mask))
863 elif res
is None or self.max_html_redirects
is None or \
864 self.max_html_redirects < CONSTS.MAX_HTML_REDIRECTS_LIMIT
or \
865 not self.allow_html_redirects:
867 elif self.max_html_redirects > 0
and self.htmlRedirects >= self.max_html_redirects:
868 self.logger.debug(
"Max html redirects reached %s>=%s", str(self.htmlRedirects), str(self.max_html_redirects))
869 self.errorMask = self.errorMask | APP_CONSTS.ERROR_MAX_ALLOW_HTML_REDIRECTS
870 self.updateURLForFailed(APP_CONSTS.ERROR_MAX_ALLOW_HTML_REDIRECTS)
# HTML-redirect handling: only for text/html content; for dynamic fetch the
# <noscript> sections are erased before looking for a redirect URL.
872 elif res.rendered_unicode_content
is not None:
873 if 'content-type' in res.headers
and res.headers[
'content-type'].find(
'text/html') > -1:
876 if self.site.fetchType == BaseFetcher.TYP_DYNAMIC:
877 res.rendered_unicode_content = Utils.eraseNoScript(res.rendered_unicode_content)
# NOTE(review): the `try:` matching this except (original line ~879) is
# missing from this dump.
880 localUrl = Utils.getHTMLRedirectUrl(res.rendered_unicode_content, self.logger)
881 except Exception, err:
882 self.logger.
error(
"Error: %s", str(err))
883 self.logger.info(Utils.getTracebackInfo())
885 self.logger.debug(
"!!! HTML redirect to '%s'", str(localUrl))
# No redirect found / non-200 non-redirect code: branch bodies (887, 890–893)
# are missing from this dump.
886 if localUrl
is None or localUrl ==
'':
888 elif res.status_code != CONSTS.HTTP_CODE_200
and res.status_code
not in CONSTS.REDIRECT_HTTP_CODES:
889 self.logger.debug(
"!!! Url skipped, because http code = '%s'", str(res.status_code))
# Redirect target accepted: apply collect-URL filters; if rejected, rebase it
# against prevUrl (NOTE(review): prevUrl's assignment is not visible here),
# normalize it, and count the redirect hop.
894 collectURLs = CollectURLs()
895 isAllowedByFilter = collectURLs.filtersApply(self.site.filters, localUrl, 0, self.dbWrapper, \
896 self.batchItem.siteId,
None, \
897 Filters.OC_RE, Filters.STAGE_COLLECT_URLS)
898 if not isAllowedByFilter:
899 localUrl = urlparse.urljoin(prevUrl, localUrl)
901 localUrl = dc_event.URL(0, localUrl, normalizeMask=self.normMask).getURL(self.normMask)
902 self.logger.debug(
"HTML redirect: %s, is allowed by filters: %s", localUrl, str(bool(isAllowedByFilter)))
903 self.htmlRedirects += 1
# After the fetch loop: propagate any response error mask to the URL state.
911 if res
is not None and res.error_mask != 0:
912 self.logger.debug(
"Positive res.error_mask: %s", str(res.error_mask))
913 self.updateURLForFailed(res.error_mask)
914 self.errorMask = self.errorMask | res.error_mask
# Empty responses (content-length == EMPTY_RESPONSE_SIZE) are treated as a
# failure with their own error-mask bit.
917 if res
is not None and res.headers
is not None and "content-length" in res.headers
and \
918 res.headers[
"content-length"] == EMPTY_RESPONSE_SIZE:
919 self.logger.debug(
'Zero content-length!')
920 self.errorMask = self.errorMask | APP_CONSTS.ERROR_EMPTY_RESPONSE
921 self.updateURLForFailed(self.errorMask)
# NOTE(review): below the code switches between `res` and `self.res` — the
# assignment `self.res = res` (original lines 922–928) is not visible here.
929 self.logger.info(
"!!! response code: '%s'", str(self.res.status_code))
930 self.logger.info(
"!!! response cookies: '%s'", str(self.res.cookies))
# Build the crawled resource; CONTENT_TYPE_MAP is passed only when the site
# property exists, otherwise None.
932 self.crawledTime = time.time()
933 self.resourceProcess.urlObj = self.url
934 resource = self.resourceProcess.generateResource(startTime, res, self.headers, self.crawledTime,
935 self.defaultIcrCrawlTime,
936 self.siteProperties[
"CONTENT_TYPE_MAP"]
if \
937 "CONTENT_TYPE_MAP" in self.siteProperties
else None)
# Optional content replacement driven by the REPLACEMENT_CONTENT_DATA site
# property; content size is recomputed from the replaced content.
940 if APP_CONSTS.REPLACEMENT_CONTENT_DATA
in self.siteProperties:
941 self.logger.debug(
"!!! Found property 'REPLACE' !!!")
943 self.res.rendered_unicode_content = ContentEvaluator.executeReplace(
944 dbWrapper=self.dbWrapper,
945 siteId=self.batchItem.siteId,
946 propertyString=self.siteProperties[APP_CONSTS.REPLACEMENT_CONTENT_DATA],
947 contentData=self.res.rendered_unicode_content)
949 self.res.content_size = len(self.res.rendered_unicode_content)
# Persist response cookies for this URL.
953 self.cookieResolver.addCookie(url, resource.cookies)
# Copy dynamic-fetcher metadata onto the resource and publish the result.
955 resource.dynamic_fetcher_type = res.dynamic_fetcher_type
956 resource.dynamic_fetcher_result_type = res.dynamic_fetcher_result_type
957 self.crawledResource = resource
# Feed last-modified into the modification detector when one is configured.
961 if self.detectModified
is not None:
962 self.detectModified.lastModified = self.crawledResource.last_modified
# HTTP >= 400 is a fetch error; 403 REPLACES the accumulated mask with
# ERROR_FETCH_FORBIDDEN (assignment, not OR) — presumably intentional.
964 if self.crawledResource.http_code >= CONSTS.HTTP_CODE_400:
965 self.errorMask = self.errorMask | APP_CONSTS.ERROR_HTTP_ERROR
967 if self.crawledResource.http_code == CONSTS.HTTP_CODE_403:
968 self.errorMask = APP_CONSTS.ERROR_FETCH_FORBIDDEN
970 self.updateSiteParams(self.errorMask)
971 self.updateURLForFailed(self.errorMask, self.crawledResource.http_code)
# NOTE(review): context lines 972–978 are missing — this redirect check is
# presumably guarded by a redirect-status condition not visible here.
979 if not self.allow_http_redirects:
980 self.errorMask = self.errorMask | APP_CONSTS.ERROR_MAX_ALLOW_HTTP_REDIRECTS
981 self.updateSiteParams(APP_CONSTS.ERROR_MAX_ALLOW_HTTP_REDIRECTS)
982 self.updateURLForFailed(self.errorMask)
# Resource size/validity check; on failure the resource's own error mask is
# folded into the crawl error mask.
988 self.resourceProcess.resource = resource
989 self.resourceProcess.batchItem = self.batchItem
990 if not self.resourceProcess.checkResourcesResponse(res, self.site.maxResourceSize, self.updateSiteParams):
991 self.errorMask = self.errorMask | resource.error_mask
992 self.updateURLForFailed(self.errorMask, res.status_code)
# --- STAGE_BEFORE_DOM_PRE block handlers -------------------------------
# Three filter passes follow: (1) RAW content regex, (2) HTTP headers regex,
# (3) last-modified date comparison (SQL-expression operated). Each failed
# pass sets ERROR_CRAWLER_FILTERS_BREAK and marks the URL failed; the
# `else:` lines separating SUCCESS/FAILED branches are missing in this dump.
997 self.logger.debug(
"+++++++++++++++++++++++++++++++++++++")
998 self.logger.debug(
"Block handlers 'STAGE_BEFORE_DOM_PRE'")
999 collectURLs = CollectURLs()
1000 self.logger.debug(
"self.site.filters: " +
varDump(self.site.filters))
# Pass 1: RAW content regular-expression filters.
1002 localFilters = Filters(
None, self.dbWrapper, self.batchItem.siteId, 0,
1003 None, Filters.OC_RE, Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_RAW_CONTENT)
1005 self.logger.debug(
'>>> localFilters.filters: ' +
varDump(localFilters.filters))
1006 self.logger.debug(
">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
1007 str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
1009 if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
1010 self.logger.debug(
"Check RAW content text regular expression ...")
1011 if collectURLs.filtersApply(
None, resource.binary_content, 0, self.dbWrapper,
1012 self.batchItem.siteId,
None, Filters.OC_RE,
1013 Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_RAW_CONTENT,
True):
1014 self.logger.debug(
"RAW content text regular expression check SUCCESS")
1016 self.logger.debug(
"RAW content text regular expression check FAILED")
1017 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
1018 self.updateURLForFailed(self.errorMask)
# Pass 2: HTTP response-header regular-expression filters.
1023 localFilters = Filters(
None, self.dbWrapper, self.batchItem.siteId, 0,
1024 None, Filters.OC_RE, Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_HEADERS_ALL)
1026 self.logger.debug(
'>>> localFilters.filters: ' +
varDump(localFilters.filters))
1027 self.logger.debug(
">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
1028 str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
1030 if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
1031 self.logger.debug(
"Check HTTP headers by name text regular expression check ...")
1032 if collectURLs.filtersApply(
None, resource.response_header, 0, self.dbWrapper,
1033 self.batchItem.siteId,
None, Filters.OC_RE,
1034 Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_HEADERS_ALL,
True):
1035 self.logger.debug(
"HTTP headers by name text regular expression check SUCCESS")
1037 self.logger.debug(
"HTTP headers by name text regular expression check FAILED")
1038 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
1039 self.updateURLForFailed(self.errorMask)
# Pass 3: last-modified datetime comparison (OC_SQLE with a PDATE variable).
1044 self.logger.debug(
"Check Last modified datetime value date comparison check ...")
1045 self.logger.debug(
'resource.last_modified = ' + str(resource.last_modified))
1047 localFilters = Filters(
None, self.dbWrapper, self.batchItem.siteId, 0,
1048 {
'PDATE':str(resource.last_modified)}, Filters.OC_SQLE, Filters.STAGE_BEFORE_DOM_PRE,
1049 Filters.SELECT_SUBJECT_LAST_MODIFIED)
1051 self.logger.debug(
'>>> localFilters.filters: ' +
varDump(localFilters.filters))
1052 self.logger.debug(
">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
1053 str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
1055 if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
1056 if collectURLs.filtersApply(
None,
'', 0, self.dbWrapper, self.batchItem.siteId,
1057 {
'PDATE':str(resource.last_modified)}, Filters.OC_SQLE,
1058 Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_LAST_MODIFIED,
True):
1059 self.logger.debug(
"Last modified datetime value date comparison check SUCCESS")
1061 self.logger.debug(
"Last modified datetime value date comparison check FAILED")
1062 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
1063 self.updateURLForFailed(self.errorMask)
# --- exception mapping -------------------------------------------------
# Each handler sets the corresponding APP_CONSTS error-mask bit, marks the
# URL failed, and installs a default response so self.res is always usable.
# NOTE(review): the `try:` these handlers belong to (near the top of the
# method) is not visible in this dump. Python 2 `except X, err:` throughout.
1068 except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
1069 self.errorMask = self.errorMask | APP_CONSTS.ERROR_REQUEST_TIMEOUT
1070 self.updateURLForFailed(APP_CONSTS.ERROR_REQUEST_TIMEOUT)
1071 self.res = self.makeDefaultResponse(self.res)
1073 except requests.exceptions.InvalidURL:
1074 self.errorMask = self.errorMask | APP_CONSTS.ERROR_BAD_URL
1075 self.updateURLForFailed(APP_CONSTS.ERROR_BAD_URL)
1076 self.res = self.makeDefaultResponse(self.res)
1078 except requests.exceptions.TooManyRedirects:
1079 self.errorMask = self.errorMask | APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
1080 self.updateURLForFailed(APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS)
1081 self.res = self.makeDefaultResponse(self.res)
1083 except requests.exceptions.ChunkedEncodingError:
1084 self.errorMask = self.errorMask | APP_CONSTS.ERROR_PAGE_CONVERT_ERROR
1085 self.updateURLForFailed(APP_CONSTS.ERROR_PAGE_CONVERT_ERROR)
1086 self.res = self.makeDefaultResponse(self.res)
1088 except requests.exceptions.ConnectionError:
1089 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CONNECTION_ERROR
1090 self.updateURLForFailed(APP_CONSTS.ERROR_CONNECTION_ERROR)
1091 self.res = self.makeDefaultResponse(self.res)
1093 except requests.exceptions.ContentDecodingError:
1094 self.errorMask = self.errorMask | APP_CONSTS.ERROR_PAGE_CONVERT_ERROR
1095 self.updateURLForFailed(APP_CONSTS.ERROR_PAGE_CONVERT_ERROR)
1096 self.res = self.makeDefaultResponse(self.res)
1098 except lxml.etree.XMLSyntaxError:
1099 self.logger.debug(
"XML HTML syntax error")
1100 self.errorMask = self.errorMask | APP_CONSTS.ERROR_DTD_INVALID
1101 self.updateURLForFailed(APP_CONSTS.ERROR_DTD_INVALID)
1102 self.res = self.makeDefaultResponse(self.res)
# Proxy failure: honors an optional status override carried on the exception.
1104 except ProxyException, err:
1105 self.logger.debug(
'self.errorMask = ' + str(self.errorMask) +
' err.code = ' + str(err.code) + \
1106 ' err.statusUpdate = ' + str(err.statusUpdate))
1107 status = dc_event.URL.STATUS_CRAWLED
1108 if err.statusUpdate
is not None:
1109 status = err.statusUpdate
1110 self.logger.debug(
'Set status update = ' + str(status))
1111 self.errorMask = self.errorMask | err.code
1112 self.updateURLForFailed(self.errorMask, SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED, status)
1113 self.res = self.makeDefaultResponse(self.res, SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED)
# Selenium failure outside the retry loop: map err.code to an HTTP code via
# the errorMaskHttpCodeDict lookup, defaulting to 400.
1115 except SeleniumFetcherException, err:
1116 self.logger.
error(
"Selenium fetcher error: " + str(err) +
' code = ' + str(err.code))
1117 httpCode = CONSTS.HTTP_CODE_400
1118 if err.code
in self.errorMaskHttpCodeDict:
1119 httpCode = self.errorMaskHttpCodeDict[err.code]
1120 self.errorMask = self.errorMask | err.code
1121 self.updateURLForFailed(self.errorMask, httpCode)
1122 self.res = self.makeDefaultResponse(self.res, httpCode)
1124 except UrlAvailableException, err:
1125 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CONNECTION_ERROR
1126 self.updateURLForFailed(APP_CONSTS.ERROR_CONNECTION_ERROR)
1127 self.res = self.makeDefaultResponse(self.res)
1129 except requests.exceptions.HTTPError, err:
1130 self.errorMask = self.errorMask | APP_CONSTS.ERROR_FETCH_HTTP_ERROR
1131 self.updateURLForFailed(APP_CONSTS.ERROR_FETCH_HTTP_ERROR)
1132 self.res = self.makeDefaultResponse(self.res)
1134 except requests.exceptions.URLRequired, err:
1135 self.errorMask = self.errorMask | APP_CONSTS.ERROR_FETCH_INVALID_URL
1136 self.updateURLForFailed(APP_CONSTS.ERROR_FETCH_INVALID_URL)
1137 self.res = self.makeDefaultResponse(self.res)
# RequestException is the requests base class — must stay AFTER its subclasses
# above to avoid shadowing them.
1139 except requests.exceptions.RequestException, err:
1140 self.errorMask = self.errorMask | APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
1141 self.updateURLForFailed(APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST)
1142 self.res = self.makeDefaultResponse(self.res)
1144 except CrawlerFilterException, err:
1145 self.errorMask = self.errorMask | APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
1146 self.updateURLForFailed(APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK)
1147 self.res = self.makeDefaultResponse(self.res)
# Not-modified: not an error — updates status via the modification detector.
# NOTE(review): updateUDate's initialization (original line 1151) is missing.
1149 except NotModifiedException, err:
1150 status = dc_event.URL.STATUS_CRAWLED
1152 if self.detectModified
is not None:
1153 status, updateUDate = self.detectModified.notModifiedStateProcessing(self.batchItem.siteId, self.realUrl,
1154 self.dbWrapper, status, updateUDate)
1155 self.logger.debug(
"!!! URL is NOT MODIFIED. Update httpCode = %s, status = %s, updateUDate = %s",
1156 str(err.httpCode), str(status), str(updateUDate))
1158 self.updateURLForFailed(self.errorMask, err.httpCode, status, updateUDate)
1159 self.res = self.makeDefaultResponse(self.res, err.httpCode)
1161 except DatabaseException, err:
1162 self.errorMask = self.errorMask | APP_CONSTS.ERROR_DATABASE_ERROR
1163 self.updateURLForFailed(APP_CONSTS.ERROR_DATABASE_ERROR)
1164 self.res = self.makeDefaultResponse(self.res)
1166 except InternalCrawlerException, err:
1167 self.errorMask = self.errorMask | APP_CONSTS.ERROR_FETCHER_INTERNAL
1168 self.updateURLForFailed(APP_CONSTS.ERROR_FETCHER_INTERNAL)
1169 self.res = self.makeDefaultResponse(self.res)
# Catch-all: general crawler error, logged through the project's handler.
1171 except Exception, err:
1172 self.errorMask = self.errorMask | APP_CONSTS.ERROR_GENERAL_CRAWLER
1173 self.updateURLForFailed(APP_CONSTS.ERROR_GENERAL_CRAWLER)
1174 ExceptionLog.handler(self.logger, err,
"Crawler fatal error.", (err), \
1175 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)