"""
HCE project, Python bindings, Distributed Tasks Manager application.
Event objects definitions.

@author scorp <developers.hce@gmail.com>, Alexander Vybornyh <alexander.hce.cluster@gmail.com>
@copyright: Copyright © 2013-2017 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

import cPickle as pickle
from time import strftime
from collections import namedtuple
from cement.core import foundation
MSG_ERROR_LOAD_CONFIG = "Error loading config file. Exiting. "
MSG_ERROR_LOAD_OPTIONS = "Error loading options. Exiting. "
MSG_ERROR_LOAD_LOG_CONFIG_FILE = "Can't load logging config file. Exiting. "
MSG_ERROR_LOAD_SITE_DATA = "Can't load site data: "
MSG_ERROR_UPDATE_SITE_DATA = "Can't update site data: "
MSG_ERROR_LOAD_URL_DATA = "Can't load url data: "
MSG_ERROR_PROCESS_BATCH_ITEM = "Can't process batch item "
MSG_ERROR_WRITE_CRAWLED_DATA = "Can't write crawled data "
MSG_ERROR_COLLECT_URLS = "Can't collect urls "
MSG_ERROR_ADD_URL_TO_BATCH_ITEM = "Can't add url to batch item "
MSG_ERROR_LOAD_SITE_PROPERTIES = "Can't load site properties "
MSG_ERROR_CRAWL_SITE = "Can't crawl site "
MSG_ERROR_CHECK_SITE = "Site didn't pass the site check "
MSG_ERROR_GET_DIR = "Can't get dir "
MSG_ERROR_READ_SITE_FROM_DB = "Can't read site data from db"
MSG_ERROR_EMPTY_RESPONSE_SIZE = "Empty response"
MSG_ERROR_NOT_EXIST_ANY_VALID_PROXY = "No valid proxy exists"
MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong: %s"
MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file. %s"
MSG_ERROR_EXTRACT_BASE_URL = "Extract base url failed. Error: %s"

MSG_INFO_PROCESS_BATCH = "ProcessBatch "
MSG_INFO_STORE_COOKIES_FILE = "Store cookies file on disk."

MSG_DEBUG_NON_PROCESSING = "ProcessorName is NONE. Exclude batch item from further processing."

SITE_MD5_EMPTY = "d41d8cd98f00b204e9800998ecf8427e"  # MD5 of the empty string

DEFAULT_MAX_SIZE = 1000000
EMPTY_RESPONSE_SIZE = "0"

APP_NAME = "crawler-task"

HTTP_COOKIE = "HTTP_COOKIE"
DEFAULT_HTTP_COOKIE = ""
HTTP_HEADERS = "HTTP_HEADERS"
DEFAULT_HTTP_HEADER = ""

DC_URLS_DB_NAME = "dc_urls"
DC_URLS_TABLE_PREFIX = "urls_"
DC_SITES_DB_NAME = "dc_sites"
DC_SITES_PROPERTIES_TABLE_NAME = "sites_properties"
DC_SITES_TABLE_NAME = "sites"
DC_URLS_TABLE_NAME = "urls"
COOKIES_FILE_POSTFIX = ".cookies.txt"

NON_PROCESSING = "NONE"
HTTP_REDIRECT = "<Response [301]>"
MAX_HTTP_REDIRECTS_UNLIMITED = 0
MAX_HTML_REDIRECTS_UNLIMITED = 0
META_XPATH = "//meta[contains(@content, 'url')]/@content"

Results = namedtuple("Results", "exit_code, output, err")

ROBOTS_PATTERN = re.compile(r'(https?://[^/]+).*', re.I)
TEXT_CONTENT_TYPE_PATTERN = re.compile('text', re.I)

ENV_CRAWLER_STORE_PATH = "ENV_CRAWLER_STORE_PATH"

DETECT_MIME_MAIN_CONTENT = "1"
RECOVER_IF_FAILED = "2"

errorMaskHttpCodeDict = {
  # Note: ERROR_FETCHER_INTERNAL and ERROR_FETCH_FORBIDDEN each appear twice below;
  # in a Python dict literal the later entry silently overrides the earlier one.
  APP_CONSTS.ERROR_FETCH_INVALID_URL: SeleniumFetcher.ERROR_NAME_NOT_RESOLVED,
  APP_CONSTS.ERROR_FETCHER_INTERNAL: SeleniumFetcher.ERROR_FATAL,
  APP_CONSTS.ERROR_FETCHER_INTERNAL: SeleniumFetcher.ERROR_GENERAL,
  APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS: SeleniumFetcher.ERROR_TOO_MANY_REDIRECTS,
  APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR: SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED,
  APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT: SeleniumFetcher.ERROR_CONNECTION_TIMED_OUT,
  APP_CONSTS.ERROR_FETCH_FORBIDDEN: SeleniumFetcher.ERROR_TUNNEL_CONNECTION_FAILED,
  APP_CONSTS.ERROR_EMPTY_RESPONSE: SeleniumFetcher.ERROR_EMPTY_RESPONSE,
  APP_CONSTS.ERROR_FETCH_FORBIDDEN: SeleniumFetcher.ERROR_SERVICE_UNAVAILABLE,
  APP_CONSTS.ERROR_NOT_EXIST_ANY_VALID_PROXY: SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED,
  APP_CONSTS.ERROR_FETCH_HTTP_ERROR: SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED
}
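The mapping above pairs application error-mask bits with SeleniumFetcher error codes. A minimal sketch of a reverse lookup over it, finding the application mask for a fetcher error code, is shown below; the helper name and the scan direction are illustrative assumptions, not part of the listing:

def errorMaskForFetcherError(fetcherErrorCode):
  # Scan the mapping and return the first application error mask whose
  # associated SeleniumFetcher error code matches; ERROR_OK otherwise.
  for appErrorMask, seleniumError in errorMaskHttpCodeDict.items():
    if seleniumError == fetcherErrorCode:
      return appErrorMask
  return APP_CONSTS.ERROR_OK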
DB_HOST_CFG_NAME = "db_host"
DB_PORT_CFG_NAME = "db_port"
DB_USER_CFG_NAME = "db_user"
DB_PWD_CFG_NAME = "db_pwd"
DB_SITES_CFG_NAME = "db_dc_sites"
DB_URLS_CFG_NAME = "db_dc_urls"

RAW_DATA_DIR = "raw_data_dir"
SITE_TEMPLATES = "dc_site_template"
KEY_VALUE_STORAGE_DIR = "key_value_storage_dir"
DB_DATA_DIR = "db_data_dir"
URL_SCHEMA_DIR = "url_schema_data_dir"
URLS_XPATH_LIST_FILE = "urls_xpath_list_file"

HOST_ALIVE_CHECK_NAME = 'HOST_ALIVE_CHECK'
HOST_ALIVE_CHECK_PROXY_NAME = 'HOST_ALIVE_CHECK_PROXY'
DEFAULT_PROTOCOL_PREFIX = 'http://'

SEARCH_BASE_URL_PATTERN = r'<base[^>]+href="([^">]+)"'

foundation.CementApp.__init__(self)
self.tidyOptions = {'numeric-entities': 1, 'char-encoding': 'utf8'}

foundation.CementApp.setup(self)
foundation.CementApp.run(self)

self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
host = Utils.UrlParser.getDomain(url)
if host is not None and self.siteHeaders is not None:
  # ...

auth = urlparse.urlsplit(url.strip())[1]
host = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()[1]
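The regular expression splits a netloc of the form "user@host:port" into its parts; a quick illustration (values invented for the example):

auth = "alice@example.com:8080"
groups = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()
# groups == ('alice@', 'example.com', '8080'); groups[1] is the bare host name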
collectURLsResult = False
if True or self.dom is not None:
  collectURLs.url = self.url
  collectURLs.dom = self.dom
  collectURLs.realUrl = self.realUrl
  collectURLs.baseUrl = self.batchItem.baseUrl
  # ...
  collectURLs.site = self.site
  # ...
  self.logger.debug("!!! self.batchItem.baseUrl = %s", str(self.batchItem.baseUrl))
  if self.batch.crawlerType != dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
    # ...
    proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
                                         # ... HTTPCookieResolver.STAGE_REGULAR)
  collectURLsResult, internalLinks, externalLinks, urlObjects, self.feedItems, chainUrls = \
      ...  # call elided in the listing; one argument is: not self.batch.dbMode & dc_event.Batch.DB_MODE_W
  self.logger.debug("!!! internalLinks (%s): %s", str(len(internalLinks)), str(internalLinks))
  self.logger.debug("!!! externalLinks (%s): %s", str(len(externalLinks)), str(externalLinks))

collectPropertiesObj.siteId = Utils.autoFillSiteId(self.batchItem.siteId, self.logger)
collectPropertiesObj.kvDbDir = self.kvDbDir
collectPropertiesObj.res = self.res
collectPropertiesObj.batchItem = self.batchItem
collectPropertiesObj.realUrl = self.realUrl
collectPropertiesObj.process(self.dom, internalLinks, externalLinks)

if urlObjects is not None and len(urlObjects) > 0 and \
   # ...
if chainUrls is not None and len(chainUrls) > 0:
  # ...
self.logger.info("self.dbWrapper is None")
self.logger.info("Content not set on disk because a content-type `%s` is not in MIME types list:\n%s",
                 # ...

if raw_unicode_content:
  try:
    raw_unicode_content = raw_unicode_content.decode(self.crawledResource.charset)
  except Exception as err:
    self.logger.debug("Decoding content charset error, type: '" + str(type(raw_unicode_content)) + \
                      "', length: " + str(len(raw_unicode_content)) + " to charset: '" + \
                      # ...
putDict = {'data': base64.b64encode(raw_unicode_content)}
contentType = dc_event.Content.CONTENT_RAW_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))

try:
  tidy_content = tidylib.tidy_document(raw_unicode_content, self.tidyOptions)[0]
  # ...
except Exception as err:
  self.logger.debug("Decoding tidy content charset error, type: '" + str(type(tidy_content)) + \
                    "', length: " + str(len(tidy_content)) + " to charset: '" + \
                    # ...
putDict = {'data': base64.b64encode(tidy_content)}
contentType = dc_event.Content.CONTENT_TIDY_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))

putDict = {'data': base64.b64encode(self.crawledResource.binary_content)}
contentType = dc_event.Content.CONTENT_RAW_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))

putDict = {"data": base64.b64encode(self.crawledResource.html_content)}
contentType = dc_event.Content.CONTENT_DYNAMIC_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))
self.logger.debug('!!! self.crawledResource.response_header = ' + str(self.crawledResource.response_header))

putDict = {"data": base64.b64encode(self.crawledResource.response_header)}
contentType = dc_event.Content.CONTENT_HEADERS_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))

putDict = {"data": base64.b64encode(self.crawledResource.html_request)}
contentType = dc_event.Content.CONTENT_REQUESTS_CONTENT
urlPut_list.append(dc_event.URLPut(self.batchItem.siteId, self.batchItem.urlId, contentType, putDict))

self.dbWrapper.putURLContent(urlPut_list)

self.url.contentMask = dc_event.URL.CONTENT_STORED_ON_DISK
self.batchItem.urlObj.contentMask = dc_event.URL.CONTENT_STORED_ON_DISK

self.logger.debug('!!! makeDir() enter .... self.dir = ' + str(self.dir))
if not os.path.exists(self.dir):
  os.makedirs(self.dir)
if not os.path.isdir(self.dir):
  raise Exception("path %s exists, but is not a directory" % (self.dir,))
def updateURLForFailed(self, errorBit, httpCode=CONSTS.HTTP_CODE_400, status=dc_event.URL.STATUS_CRAWLED, \
                       updateUdate=True):
  # ...

def httpRequestWrapper(self, url, headers, auth, postData, urlObj, incomingContent, macroCode=None, proxyName=None):
  localFetchType = copy.deepcopy(self.site.fetchType)
  if localFetchType == BaseFetcher.TYP_DYNAMIC:
    if urlObj is not None and urlObj.parentMd5 is not None and \
       # ...
      if urlObj.parentMd5 == "":
        localFetchType = BaseFetcher.TYP_DYNAMIC
      else:
        localFetchType = BaseFetcher.TYP_NORMAL
  elif localFetchType == BaseFetcher.TYP_AUTO:
    localFetchType = BaseFetcher.TYP_NORMAL

  if urlObj is not None and urlObj.parentMd5 is not None and urlObj.parentMd5 == "" and \
     self.processorName == PCONSTS.PROCESSOR_RSS and localFetchType == BaseFetcher.TYP_DYNAMIC:
    localFetchType = BaseFetcher.TYP_NORMAL

  fetchResType = FetcherType.getFromProperty(self.siteProperties["FETCHER_TYPE"], urlObj.url, self.logger)
  if fetchResType is not None:
    localFetchType = fetchResType

  self.logger.debug(">>> FetchType before applying = " + str(localFetchType))
  self.logger.debug(">>> self.detectModified.modifiedSettings = " + str(self.detectModified.modifiedSettings))
  self.logger.debug(">>> self.urlProcess.urlObj.lastModified = " + str(self.urlProcess.urlObj.lastModified))

  if # ... \
     (urlObj is not None and urlObj.crawled == 0):
    if incomingContent is None:
      fetcher = BaseFetcher.get_fetcher(localFetchType, self.dbWrapper, self.site.id)
      fetcher.connectionTimeout = float(self.siteProperties["CONNECTION_TIMEOUT"])
      # ...
      fetcher.connectionTimeout = CONSTS.CONNECTION_TIMEOUT
      # ...
      self.logger.debug('self.external_url: ' + str(self.external_url) + ' url: ' + str(url))
      url = self.external_url.replace('%URL%', urllib.quote(url))
      self.logger.debug('New url: ' + str(url))
      tm = int(self.url.httpTimeout) / 1000.0
      if isinstance(self.url.httpTimeout, float):
        tm += float('0' + str(self.url.httpTimeout).strip()[str(self.url.httpTimeout).strip().find('.'):])
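A worked example of the timeout conversion above (the value is invented for illustration):

httpTimeout = 2500.5                  # milliseconds, as stored on the URL object
tm = int(httpTimeout) / 1000.0        # 2500 ms -> 2.5 s
s = str(httpTimeout).strip()
tm += float('0' + s[s.find('.'):])    # '.5' -> 0.5, so tm == 3.0
# Note the quirk: the fractional part of the millisecond value is added as whole
# seconds, not as a sub-millisecond fraction.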
      cookieStage = HTTPCookieResolver.STAGE_REDIRECT
      # ...
      cookieStage = cookieStage | HTTPCookieResolver.STAGE_RSS

      self.logger.debug("!!! Before fetcher.open() for url: %s", str(url))
      self.logger.debug("!!! Before fetcher.open() self.site.maxResourceSize = %s", str(self.site.maxResourceSize))
      ret = fetcher.open(url, timeout=tm, headers=headers,
                         # ...
                         filters=None if urlObj.parentMd5 == "" else self.site.filters,
                         depth=urlObj.depth, macro=macroCode)
    else:
      fetcher = BaseFetcher.get_fetcher(BaseFetcher.TYP_CONTENT, self.dbWrapper, self.site.id)
      fetcher.connectionTimeout = float(self.siteProperties["CONNECTION_TIMEOUT"])
      # ...
      fetcher.connectionTimeout = CONSTS.CONNECTION_TIMEOUT
      ret = fetcher.open(url, inputContent=incomingContent, log=self.logger)

  httpParams["url"] = url
  tm = int(self.url.httpTimeout) / 1000.0
  if isinstance(self.url.httpTimeout, float):
    tm += float('0' + str(self.url.httpTimeout).strip()[str(self.url.httpTimeout).strip().find('.'):])
  httpParams["httpTimeout"] = tm
  httpParams["httpHeader"] = headers
  httpParams["proxies"] = self.proxies
  httpParams["auth"] = auth
  httpParams["postData"] = postData
  httpParams["maxResourceSize"] = self.site.maxResourceSize

  ret = self.detectModified.makeHTTPRequest(localFetchType, httpParams)
  self.logger.debug("!!! self.detectModified.isNotModified() ret.status_code: %s !!!", str(ret.status_code))
  for h in rotatedHeaders:
    # ...

  # fragment of a referer-resolver call:
  #   ... if "REFERER_SELF_URL" in self.siteProperties else RefererHeaderResolver.MODE_SIMPLE, url, ...

  self.logger.debug("Make request delay " + str(self.url.requestDelay / 1000.0) + " sec.")
  time.sleep(self.url.requestDelay / 1000.0)

  self.logger.debug("!!! self.url.url = '%s'", str(self.url.url))
  # ...
  startTime = time.time()
  # ...
  self.logger.debug("!!! url = '%s'", str(url))

  try:
    # ... macro initialization elided
  except Exception as err:
    self.logger.error("Initialization of macro error: %s, source: %s", str(err),
                      # ...

  self.logger.debug("!!! urlTemplateApply() return url = '%s'", str(url))

  if not CrawlerTask.isAvailableUrl(siteProperties=self.siteProperties, url=url, logger=self.logger):
    self.logger.debug("Host '%s' is not available!", str(url))
    # ...

  self.logger.debug("Robots.txt obey mode ON")
  proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
                                       # ... HTTPCookieResolver.STAGE_ROBOTS)
  self.logger.debug(">>> URL " + url + " is NOT allowed by user-agent: " + str(retUserAgent))
  self.logger.debug("!!! localUrl = '%s'", str(localUrl))

  retriesCount = HTTPProxyResolver.getTriesCount(self.siteProperties)
  for count in range(0, retriesCount + 1):
    self.logger.debug("retriesCount = %s, count = %s", str(retriesCount), str(count))
    HTTPProxyResolver.checkTriesCount(siteProperties=self.siteProperties, currentTriesCount=proxyTriesCount)
    proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
                                         # ...
    if proxyName is not None:
      # ...
    self.logger.info("start to fetch: %s", localUrl)
    try:
      # ...
    except SeleniumFetcherException as err:
      self.logger.debug("!!! httpRequestWrapper return error: %s", str(err))
      # ...
    if CrawlerTask.isNeedRotateProxy(siteProperties=self.siteProperties,
                                     # ...
                                     rawContent=res.rendered_unicode_content):
      self.logger.debug('Need to rotate proxy. Going to the next one...')
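A condensed, hypothetical form of the retry loop above, using only calls whose signatures appear in this listing; the helper name and the simplified argument set are assumptions:

def fetchWithProxyRotation(self, localUrl):
  # Pick a proxy, fetch, rotate on failure, and account the fault so a bad
  # proxy is eventually skipped. Sketch only, not the project's actual code.
  retriesCount = HTTPProxyResolver.getTriesCount(self.siteProperties)
  res = None
  for count in range(0, retriesCount + 1):
    proxyName = CrawlerTask.getProxyName(siteProperties=self.siteProperties,
                                         siteId=self.batchItem.siteId, url=localUrl,
                                         dbWrapper=self.dbWrapper, logger=self.logger)
    res = self.httpRequestWrapper(localUrl, None, None, None, None, None, proxyName=proxyName)
    if not CrawlerTask.isNeedRotateProxy(siteProperties=self.siteProperties,
                                         siteId=self.batchItem.siteId, proxyName=proxyName,
                                         dbWrapper=self.dbWrapper,
                                         rawContent=res.rendered_unicode_content):
      break
    CrawlerTask.addProxyFaults(self.siteProperties, self.batchItem.siteId, proxyName, self.dbWrapper)
  return res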
  if res is not None and res.error_mask != APP_CONSTS.ERROR_OK:
    self.logger.debug("res.error_mask = %s", str(res.error_mask))
    # ...
  elif res.rendered_unicode_content is not None:
    if 'content-type' in res.headers and res.headers['content-type'].find('text/html') > -1:
      if self.site.fetchType == BaseFetcher.TYP_DYNAMIC:
        res.rendered_unicode_content = Utils.eraseNoScript(res.rendered_unicode_content)
      try:
        localUrl = Utils.getHTMLRedirectUrl(res.rendered_unicode_content, self.logger)
      except Exception as err:
        # ...
        self.logger.info(Utils.getTracebackInfo())
      self.logger.debug("!!! HTML redirect to '%s'", str(localUrl))
      if localUrl is None or localUrl == '':
        # ...
      elif res.status_code != CONSTS.HTTP_CODE_200 and res.status_code not in CONSTS.REDIRECT_HTTP_CODES:
        self.logger.debug("!!! Url skipped, because http code = '%s'", str(res.status_code))
      # ...
      isAllowedByFilter = collectURLs.filtersApply(self.site.filters, localUrl, 0, self.dbWrapper, \
                                                   # ...
                                                   Filters.OC_RE, Filters.STAGE_COLLECT_URLS)
      if not isAllowedByFilter:
        localUrl = urlparse.urljoin(prevUrl, localUrl)
      # ...
      localUrl = dc_event.URL(0, localUrl, normalizeMask=self.normMask).getURL(self.normMask)
      self.logger.debug("HTML redirect: %s, is allowed by filters: %s", localUrl, str(bool(isAllowedByFilter)))
  if res is not None and res.error_mask != 0:
    self.logger.debug("Positive res.error_mask: %s", str(res.error_mask))
    # ...
  if res is not None and res.headers is not None and "content-length" in res.headers and \
     res.headers["content-length"] == EMPTY_RESPONSE_SIZE:
    self.logger.debug('Zero content-length!')
    # ...
  self.logger.info("!!! response code: '%s'", str(self.res.status_code))
  self.logger.info("!!! response cookies: '%s'", str(self.res.cookies))

  self.logger.debug("!!! Found property 'REPLACE' !!!")
  self.res.rendered_unicode_content = ContentEvaluator.executeReplace(
      propertyString=self.siteProperties[APP_CONSTS.REPLACEMENT_CONTENT_DATA],
      contentData=self.res.rendered_unicode_content)
  self.res.content_size = len(self.res.rendered_unicode_content)

  resource.dynamic_fetcher_type = res.dynamic_fetcher_type
  resource.dynamic_fetcher_result_type = res.dynamic_fetcher_result_type
  # ...
  self.errorMask = APP_CONSTS.ERROR_FETCH_FORBIDDEN
  self.logger.debug("+++++++++++++++++++++++++++++++++++++")
  self.logger.debug("Block handlers 'STAGE_BEFORE_DOM_PRE'")

  # ... (None, Filters.OC_RE, Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_RAW_CONTENT)
  self.logger.debug('>>> localFilters.filters: ' + varDump(localFilters.filters))
  self.logger.debug(">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
                    str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
  if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
    self.logger.debug("Check RAW content text regular expression ...")
    if collectURLs.filtersApply(None, resource.binary_content, 0, self.dbWrapper,
                                self.batchItem.siteId, None, Filters.OC_RE,
                                Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_RAW_CONTENT, True):
      self.logger.debug("RAW content text regular expression check SUCCESS")
    else:
      self.logger.debug("RAW content text regular expression check FAILED")

  # ... (None, Filters.OC_RE, Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_HEADERS_ALL)
  self.logger.debug('>>> localFilters.filters: ' + varDump(localFilters.filters))
  self.logger.debug(">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
                    str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
  if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
    self.logger.debug("Check HTTP headers by name text regular expression ...")
    if collectURLs.filtersApply(None, resource.response_header, 0, self.dbWrapper,
                                self.batchItem.siteId, None, Filters.OC_RE,
                                Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_HEADERS_ALL, True):
      self.logger.debug("HTTP headers by name text regular expression check SUCCESS")
    else:
      self.logger.debug("HTTP headers by name text regular expression check FAILED")

  self.logger.debug("Check Last modified datetime value date comparison ...")
  self.logger.debug('resource.last_modified = ' + str(resource.last_modified))
  # ... ({'PDATE': str(resource.last_modified)}, Filters.OC_SQLE, Filters.STAGE_BEFORE_DOM_PRE,
  #      Filters.SELECT_SUBJECT_LAST_MODIFIED)
  self.logger.debug('>>> localFilters.filters: ' + varDump(localFilters.filters))
  self.logger.debug(">>> isExistStage('STAGE_BEFORE_DOM_PRE'): " + \
                    str(localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE)))
  if localFilters.isExistStage(Filters.STAGE_BEFORE_DOM_PRE):
    if collectURLs.filtersApply( # ...
                                {'PDATE': str(resource.last_modified)}, Filters.OC_SQLE,
                                Filters.STAGE_BEFORE_DOM_PRE, Filters.SELECT_SUBJECT_LAST_MODIFIED, True):
      self.logger.debug("Last modified datetime value date comparison check SUCCESS")
    else:
      self.logger.debug("Last modified datetime value date comparison check FAILED")
  except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout) as err:
    # ...
  except requests.exceptions.InvalidURL:
    # ...
  except requests.exceptions.TooManyRedirects:
    # ...
  except requests.exceptions.ChunkedEncodingError:
    # ...
  except requests.exceptions.ConnectionError:
    # ...
  except requests.exceptions.ContentDecodingError:
    # ...
  except lxml.etree.XMLSyntaxError:
    self.logger.debug("XML HTML syntax error")
    # ...
  except ProxyException as err:
    self.logger.debug('self.errorMask = ' + str(self.errorMask) + ' err.code = ' + str(err.code) + \
                      ' err.statusUpdate = ' + str(err.statusUpdate))
    status = dc_event.URL.STATUS_CRAWLED
    if err.statusUpdate is not None:
      status = err.statusUpdate
    self.logger.debug('Set status update = ' + str(status))
    # ...
  except SeleniumFetcherException as err:
    self.logger.error("Selenium fetcher error: " + str(err) + ' code = ' + str(err.code))
    httpCode = CONSTS.HTTP_CODE_400
    # ...
  except UrlAvailableException as err:
    # ...
  except requests.exceptions.HTTPError as err:
    # ...
  except requests.exceptions.URLRequired as err:
    # ...
  except requests.exceptions.RequestException as err:
    # ...
  except CrawlerFilterException as err:
    # ...
  except NotModifiedException as err:
    status = dc_event.URL.STATUS_CRAWLED
    # ...
    self.logger.debug("!!! URL is NOT MODIFIED. Update httpCode = %s, status = %s, updateUDate = %s",
                      str(err.httpCode), str(status), str(updateUDate))
  except DatabaseException as err:
    # ...
  except InternalCrawlerException as err:
    # ...
  except Exception as err:
    ExceptionLog.handler(self.logger, err, "Crawler fatal error.", (err), \
                         {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
  if response is None:
    # ...
  response.status_code = httpCode
  response.unicode_content = ""
  response.str_content = ""
  response.rendered_unicode_content = ""
  response.content_size = 0
  response.encoding = ""
  response.headers = {"content-length": 0, "content-type": ""}
  response.meta_res = ""
  if self.res.status_code == CONSTS.HTTP_CODE_304:
    # ...
  avgSpeed = # ... (numerator elided in the listing) \
             / (self.site.avgSpeedCounter + 1)
  self.site.avgSpeed = avgSpeed
  self.site.avgSpeedCounter += 1

  localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
  for attr in localSiteUpdate.__dict__:
    if hasattr(localSiteUpdate, attr):
      setattr(localSiteUpdate, attr, None)
  localSiteUpdate.id = self.batchItem.siteId
  localSiteUpdate.avgSpeed = avgSpeed
  localSiteUpdate.avgSpeedCounter = SQLExpression("`AVGSpeedCounter` + 1")
  self.dbWrapper.siteNewOrUpdate(localSiteUpdate)
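The numerator of the avgSpeed expression is elided in the listing, but the `/(avgSpeedCounter + 1)` fragment matches the standard incremental-mean update; a worked example under that assumption (values invented):

avg, counter, sample = 100.0, 4, 50.0
avg = (avg * counter + sample) / (counter + 1)   # -> 90.0
counter += 1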
  self.logger.debug(MSG_INFO_STORE_COOKIES_FILE)
  timePostfix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  # ...
  if RequestsRedirectWrapper.RESPONSE_COOKIE_HEADER_NAME in self.res.headers:
    cookies_str = self.res.headers[RequestsRedirectWrapper.RESPONSE_COOKIE_HEADER_NAME]
  else:
    cookies_str = ''.join([key + ": " + value + "; " for (key, value) in self.crawledResource.cookies.items()])
  self.logger.debug("Response cookies string: %s", str(cookies_str))
  self.logger.debug("self.batchItem.urlId: %s", str(self.batchItem.urlId))
  self.logger.debug("timePostfix: %s", str(timePostfix))
  if timePostfix == "":
    base_path = os.path.join(self.dir, self.batchItem.urlId)
  else:
    base_path = os.path.join(self.dir, self.batchItem.urlId + "_" + str(timePostfix))
  cookies_file_name = base_path + COOKIES_FILE_POSTFIX
  with open(cookies_file_name, "wb") as f:
    f.write(cookies_str)
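For a urlId of 'd41d8cd9...' crawled at 2017-03-01 12:00:00, the cookies file would be named <self.dir>/d41d8cd9..._20170301120000.cookies.txt (values invented). The flattening itself looks like this:

cookies = {'sessionid': 'abc123', 'csrftoken': 'xyz'}
cookies_str = ''.join([key + ": " + value + "; " for (key, value) in cookies.items()])
# -> 'sessionid: abc123; csrftoken: xyz; ' (Python 2 dict order applies)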
  keys = [localProperty["name"] for localProperty in self.site.properties]
  # ...
  for key in self.batchItem.properties.keys():
    # ...
    for localProperty in self.site.properties:
      if localProperty["name"] == key:
        self.logger.debug("%s present in site properties. Rewrite localProperty" % key)
        localProperty["value"] = self.batchItem.properties[key]
    # ...
    self.logger.debug("%s not present in site properties. Add localProperty" % key)
    self.site.properties.append({"name": key, "value": self.batchItem.properties[key],
                                 # ...

  for item in self.site.properties:
    # ...

  if allowMimes is not None and allowMimes != '' and allowMimes != '*':
    self.needStoreMime = set([mime.lower() for mime in allowMimes.split(',')])

  if key.startswith('HTTP_POST_FORM_'):
    # ...

  if DC_CONSTS.SITE_PROP_AUTO_REMOVE_RESOURCES in self.siteProperties:
    # ...
  if DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE in self.siteProperties:
    # ...
    self.siteProperties[DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE]
  if cookie is not None and cookie != "":
    if cookie.lower().startswith('cookie:'):
      self.cookie = cookie[len('cookie:'):]
    # ...

  for header in self.headers.splitlines():
    # ...
    key, value = header[:header.index(':')].strip(), header[header.index(':') + len(':'):].strip()
    # ...
    self.logger.debug("header:%s", header)
  except Exception as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_SITE_PROPERTIES)
    self.errorMask |= APP_CONSTS.ERROR_CRAWLER_FATAL_INITIALIZATION_PROJECT_ERROR

  if lh.startswith('file://'):
    lh = Utils.loadFromFileByReference(fileReference=lh, loggerObj=self.logger)
    self.siteHeaders = {'User-Agent': [h.strip() for h in lh.split("\n") if len(h) > 0 and h[0] != '#']}
  # ...
  self.siteHeaders[lh] = Utils.loadFromFileByReference(fileReference=lh, loggerObj=self.logger)

  with open(path, 'r') as f:
    return ''.join(f.readlines())
  localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
  for attr in localSiteUpdate.__dict__:
    if hasattr(localSiteUpdate, attr):
      setattr(localSiteUpdate, attr, None)
  # ...
  localSiteUpdate.errorMask = SQLExpression("`ErrorMask` | %s" % mask)
  # ...
  localSiteUpdate.state = Site.STATE_SUSPENDED
  # ...
  localSiteUpdate.id = self.batchItem.siteId
  localSiteUpdate.updateType = dc_event.SiteUpdate.UPDATE_TYPE_UPDATE
  updated_count = self.dbWrapper.siteNewOrUpdate(siteObject=localSiteUpdate, stype=dc_event.SiteUpdate)
  if updated_count > 0:
    # ...
  except DatabaseException as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_UPDATE_SITE_DATA, (err))
  except Exception as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_UPDATE_SITE_DATA, (err))
  if self.site is not None and APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER in self.siteProperties:
    # ...
    localSiteUpdate = dc_event.SiteUpdate(self.batchItem.siteId)
    for attr in localSiteUpdate.__dict__:
      if hasattr(localSiteUpdate, attr):
        setattr(localSiteUpdate, attr, None)
    # ... APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER)
    for name, value in changedFieldsDict.items():
      if hasattr(localSiteUpdate, name) and value is not None and name not in ['CDate', 'UDate', 'tcDate']:
        setattr(localSiteUpdate, name, value)
    localSiteUpdate.errorMask = SQLExpression("`ErrorMask` | %s" % self.site.errorMask)
    localSiteUpdate.id = self.batchItem.siteId
    localSiteUpdate.updateType = dc_event.SiteUpdate.UPDATE_TYPE_UPDATE
    updatedCount = self.dbWrapper.siteNewOrUpdate(siteObject=localSiteUpdate, stype=dc_event.SiteUpdate)
    self.logger.debug('Updated ' + str(updatedCount) + ' rows.')
    if updatedCount > 0:
      # ...
  except DatabaseException as err:
    self.logger.error("Update 'Site' failed, error: %s", str(err))
  except Exception as err:
    self.logger.error("Update 'Site' failed, error: %s", str(err))
  if self.site is None or urlObj is None:
    self.logger.error('Error: self.site or urlObj is None!')
    # ...
  if ((self.batch.crawlerType != dc_event.Batch.TYPE_REAL_TIME_CRAWLER) and (self.site.state != Site.STATE_ACTIVE)) \
     or ((self.batch.crawlerType == dc_event.Batch.TYPE_REAL_TIME_CRAWLER) and (self.site.state == Site.STATE_DISABLED)):
    self.logger.debug("Warning: Batch CrawlerType: %s, site state is %s but is not STATE_ACTIVE!"
                      % (str(self.batch.crawlerType), str(self.site.state)))
    # ...
  if self.site.maxErrors > 0 and self.site.errors > self.site.maxErrors:
    self.logger.debug("Site max errors: %s, limit: %s reached", str(self.site.errors), str(self.site.maxErrors))
  if urlString is None:
    # ...
  if urlObj.parentMd5 == '':
    # ...

  try:
    if batch.crawlerType != dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
      # ...
    self.logger.debug("Site not found. Assuming site id `0`.")
    self.site = dc_event.Site("")
    # ...
    if self.site is not None and self.batchItem.siteObj is not None:
      # ...
  except Exception as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_SITE_DATA, (err))
    self.errorMask |= APP_CONSTS.ERROR_CRAWLER_FATAL_INITIALIZATION_PROJECT_ERROR

  try:
    siteStatus = dc_event.SiteStatus(self.batchItem.siteId)
    drceSyncTasksCoverObj = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.SITE_STATUS, siteStatus)
    # ...
    self.site = responseDRCESyncTasksCover.eventObject
  except Exception as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_READ_SITE_FROM_DB, (err))
  self.urlTable = DC_URLS_TABLE_PREFIX + "0"

  self.logger.debug("set siteId = '" + str(self.urlProcess.siteId) + "' from 'updateBatchItemAfterCarwling'")

  self.logger.debug('!!!!!! HTTP_CODE_STATUS_UPDATE !!!!!')
  status = dc_event.URL.STATUS_CRAWLED
  try:
    statusDict = json.loads(self.siteProperties["HTTP_CODE_STATUS_UPDATE"])
    self.logger.debug('!!!!!! statusDict: ' + str(statusDict))
    # ...
    self.logger.debug("Change status from (%s) to (%s), because http_code = %s", str(status), \
                      # ...
  except Exception as err:
    self.logger.error("Loading property 'HTTP_CODE_STATUS_UPDATE' failed: " + str(err))
  if status is not None:
    self.batchItem.urlObj.status = dc_event.URL.STATUS_CRAWLED
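The HTTP_CODE_STATUS_UPDATE property is parsed with json.loads into a dict that drives the status change; its exact schema is not shown in the listing, so the shape below is an assumption for illustration:

# Hypothetical property value mapping HTTP codes to replacement statuses:
statusDict = json.loads('{"404": 4, "503": 1}')
# A lookup such as statusDict[str(res.status_code)] would then yield the status to set.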
  if self.batchItem.urlObj.contentType == "":
    # ...
  self.batchItem.urlObj.CDate = self.batchItem.urlObj.UDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

  self.logger.info("Batch item START, siteId: %s, urlId: %s, self.batch.crawlerType: %s",
                    # ...
  isResourceCrawled = False
  mpLogger = Utils.MPLogger()
  # ...
  self.logger.info("Switch logger to dedicated project %s log", str(self.batchItem.siteId))

  initFiends(None, (bool(int(self.siteProperties['ROBOTS_CACHE'])) if \
                    # ...

  if self.batch.crawlerType == dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
    if self.batchItem.urlObj.urlPut is not None:
      self.batchItem.urlObj.contentMask = dc_event.URL.CONTENT_STORED_ON_DISK
      # ...
      if "data" in self.batchItem.urlObj.urlPut.putDict:
        raw_unicode_content = self.batchItem.urlObj.urlPut.putDict["data"]
        # ...
        domParser(None, raw_unicode_content, CONSTS.HTTP_CODE_200,
                  # ...
        tidy_content = tidylib.tidy_document(raw_unicode_content, self.tidyOptions)[0]
        self.batchItem.urlObj.urlPut.putDict["data"] = base64.b64encode(tidy_content)

  if self.batch.dbMode & dc_event.Batch.DB_MODE_W == 0:
    # ...
  self.logger.debug("Common checks failed!")

  if hostRequestStorage.checkHost(None, self.url.url, self.batchItem.siteId) == HostRequestStorage.ITEM_BREAK:
    self.logger.debug(">>> Skip url [%s] by http requests freq", self.url.url)
    self.url.status = dc_event.URL.STATUS_NEW
    self.url.errorMask = APP_CONSTS.ERROR_NO_TIME_WINDOW
    self.url.httpCode = CONSTS.HTTP_CODE_400

  if nextStep and self.batch.crawlerType == dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
    if self.url.crawled > 0 and self.url.errorMask == APP_CONSTS.ERROR_OK and self.url.tagsCount > 0 and \
       # ...
      self.logger.debug("RealTime Crawling: Cached resource. Resource crawled and error mask is empty")
      if PCONSTS.RECRAWL_KEY not in self.batchItem.properties or \
         int(self.batchItem.properties[PCONSTS.RECRAWL_KEY]) == PCONSTS.RECRAWL_VALUE_NO:
        self.logger.debug("Item does not need to be recrawled.")
        self.batchItem.urlObj.contentMask = dc_event.URL.CONTENT_STORED_ON_DISK
        # ...
      else:
        self.logger.debug("Property `recrawl` = %s. Item recrawling." % \
                          str(self.batchItem.properties[PCONSTS.RECRAWL_KEY]))
  self.logger.debug('Start to crawl item')
  isResourceCrawled = self.crawl(self.batchItem.urlObj.urlPut.putDict["data"] if \
                                 (self.batchItem.urlObj.urlPut is not None and \
                                  self.batchItem.urlObj.urlPut.putDict is not None and "data" in \
                                  self.batchItem.urlObj.urlPut.putDict) else None)
  self.logger.debug("After crawl() isResourceCrawled: %s", str(isResourceCrawled))

  if self.batchItem.urlObj.type == dc_event.URL.TYPE_SINGLE:
    self.logger.debug('URL type single, do not parse and build DOM, set self.dom = None')
    # ...
  else:
    self.logger.debug('Build DOM, call domParser()\n self.crawledResource.charset = ' + \
                      # ...
  if self.dom is None:
    self.errorMask |= APP_CONSTS.ERROR_PARSE_ERROR

  self.logger.debug('>>>>> self.crawledResource.last_modified = ' + str(self.crawledResource.last_modified))
  self.logger.debug('>>> Before getPubdateUseSourceMask() self.batchItem.urlObj.pDate = ' + \
                    # ...
  self.logger.debug('>>> After getPubdateUseSourceMask() self.batchItem.urlObj.pDate = ' + \
                    # ...

  self.logger.debug("!!! parentUrl: %s", str(parentUrl))
  self.logger.debug("!!! self.url.parentMd5: %s", str(self.url.parentMd5))
  self.logger.debug("!!! self.url.url: %s", str(self.url.url))
  self.logger.debug("!!! self.feedUrl: %s", str(self.feedUrl))
  if self.url.parentMd5 == "":
    self.feedUrl[hashlib.md5(self.url.url).hexdigest()] = self.url.url
  elif self.url.parentMd5 != "" and parentUrl is None:
    # ...
  if parentUrl is not None and parentUrl != "":
    # ...
  except Exception as err:
    self.logger.error("saveCookies fail: %s\n%s", str(err), Utils.getTracebackInfo())

  except Exception as err:
    self.errorMask |= APP_CONSTS.ERROR_WRITE_FILE_ERROR
    # ...
    self.logger.error(MSG_ERROR_WRITE_CRAWLED_DATA + ': ' + str(err))

  self.logger.debug("Enter in collectURLs()")
  # ...
  self.logger.debug(">>> ChainId, update URL without status")
  # ...
  if self.res is not None:
    # ...
  if APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER in self.siteProperties:
    # ...
  except DatabaseException as err:
    self.errorMask |= APP_CONSTS.ERROR_DATABASE_ERROR
    ExceptionLog.handler(self.logger, err, MSG_ERROR_PROCESS_BATCH_ITEM, (err))
  except SyncronizeException as err:
    self.errorMask |= APP_CONSTS.ERROR_SYNCHRONIZE_URL_WITH_DB
    ExceptionLog.handler(self.logger, err, MSG_ERROR_PROCESS_BATCH_ITEM, (err), \
                         {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
  except Exception as err:
    ExceptionLog.handler(self.logger, err, MSG_ERROR_PROCESS_BATCH_ITEM, (err))

  self.logger.info("Switch logger back to default from dedicated for project %s", str(self.batchItem.siteId))
  # ...
  self.logger.info("Switched logger back to default from dedicated for project %s", str(self.batchItem.siteId))

  self.logger.debug('>>>>> Before self.dbWrapper.fieldsRecalculating([self.batchItem.siteId])')
  input_pickled_object = sys.stdin.read()
  # ...
  input_batch = pickle.loads(input_pickled_object)

  if input_batch.crawlerType != dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
    # ...
  app.Profiler.messagesList.append("Batch.id: " + str(input_batch.id))
  self.logger.info("Input batch id: %s, items: %s", str(input_batch.id), str(len(input_batch.items)))
  # ...
  self.logger.debug("len before (batch_items)=%i", len(input_batch.items))
  # ...
  self.logger.debug("len after (batch_items)=%i", len(input_batch.items))

  if int(input_batch.maxExecutionTime) > 0:
    # ...
    self.logger.debug("Set maxExecutionTime = %s, removeUnprocessedItems = %s",
                      # ...
  self.batch = input_batch
  Utils.storePickleOnDisk(input_pickled_object, ENV_CRAWLER_STORE_PATH, "crawler.in." + str(self.batch.id))

  maxBatchIterations = input_batch.maxIterations
  # ...
  self.logger.debug("maxBatchIterations=%s", str(maxBatchIterations))
  if self.batch.dbMode & dc_event.Batch.DB_MODE_W == 0:
    # ...
  for index, batchItem in enumerate(self.batch.items):
    # ...
    self.logger.debug("Maximum execution time %ss reached, process batch items loop interrupted!",
                      # ...
    self.errorMask = APP_CONSTS.ERROR_MAX_EXECUTION_TIME
    batch_items.append(batchItem)
    # ...
    if batchItem.urlObj.status == dc_event.URL.STATUS_NEW or \
       batchItem.urlObj.status == dc_event.URL.STATUS_SELECTED_CRAWLING or \
       batchItem.urlObj.status == dc_event.URL.STATUS_SELECTED_CRAWLING_INCREMENTAL:
      self.errorMask = batchItem.urlObj.errorMask
      batchItem.urlObj.status = dc_event.URL.STATUS_CRAWLING
      # ...
    self.logger.debug('========== log flush for batchId = ' + str(self.batch.id) +
                      ' batchItem index = ' + str(index))
    Utils.loggerFlush(self.logger)
  # ...
  self.logger.debug('========== log flush for batchId = ' + str(self.batch.id))
  Utils.loggerFlush(self.logger)
  # ...
  self.logger.debug('!!! Before self.updateBatchItem(self.batchItem)')
  # ...
  self.logger.debug('!!! Before self.updateBatchItem(batchItem)')
  # ...
  batch_items.append(batchItem)
  # ...
  self.logger.debug("Exit from batching iteration: " + \
                    "self.curBatchIterations=%s, maxBatchIterations=%s, beforeItems=%s, self.store_items=%s",
                    # ...

  if input_batch.crawlerType == dc_event.Batch.TYPE_REAL_TIME_CRAWLER:
    process_task_batch = input_batch
  else:
    process_task_batch = Batch(input_batch.id, batch_items)
  process_task_batch.errorMask |= self.errorMask

  if self.processorName == PCONSTS.PROCESSOR_RSS and len(process_task_batch.items) == 0 and \
     # ...
    self.logger.debug("RSS empty!")
    # ...
    self.batchItem.urlObj.errorMask |= APP_CONSTS.ERROR_RSS_EMPTY
    process_task_batch.items.append(self.batchItem)

  self.logger.info("Out batch id: %s, items: %s", str(process_task_batch.id), str(len(process_task_batch.items)))
  output_pickled_object = pickle.dumps(process_task_batch)
  Utils.storePickleOnDisk(output_pickled_object, ENV_CRAWLER_STORE_PATH, "crawler.out." + str(self.batch.id))
  sys.stdout.write(output_pickled_object)
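The crawler task thus consumes a pickled Batch on stdin and emits a pickled Batch on stdout. A minimal sketch of that contract (the function name and the trivial per-item processing are invented for illustration):

import sys
import cPickle as pickle

def runBatchCycle(processItem):
  # Read a pickled batch from stdin, process each item, write the pickled
  # result to stdout: the same in/out contract the run() code above follows.
  input_batch = pickle.loads(sys.stdin.read())
  for batchItem in input_batch.items:
    processItem(batchItem)
  sys.stdout.write(pickle.dumps(input_batch))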
  except Exception as err:
    ExceptionLog.handler(self.logger, err, 'Batch processing failed!', (err))

  additionBatchItems = []
  for batchItem in batch.items:
    # ...
    self.logger.debug("batchItem.urlObj.url: " + str(batchItem.urlObj.url))
  # ...
  self.logger.debug('>>> len(additionBatchItems) = ' + str(len(additionBatchItems)) + \
                    # ...
  self.logger.debug("url: " + str(elem.urlObj.url))
  # ...
  for elem in additionBatchItems:
    if elem.urlId not in [e.urlId for e in tmpBatchItems]:
      tmpBatchItems.append(elem)
  batch.items += tmpBatchItems
  self.logger.debug("len(batch.items) = " + str(len(batch.items)) + \
                    " len(tmpBatchItems) = " + str(len(tmpBatchItems)))

  if batch.maxItems is not None and int(batch.maxItems) < len(batch.items):
    batch.items = batch.items[0: self.batch.maxItems]
    batch.items[-1].urlObj.errorMask |= APP_CONSTS.ERROR_MAX_ITEMS
  newUrls = urlSchema.generateUrlSchema(batchItem.urlObj.url)
  self.logger.debug("Generated new urls count = %s", str(len(newUrls)))

  if self.site.maxURLs > 0 and len(newUrls) >= self.site.maxURLs:
    newUrls = set(list(newUrls)[:self.site.maxURLs])
    self.logger.debug("Site maxURLs = %s limit reached.", str(self.site.maxURLs))
  if self.site.maxResources > 0 and len(newUrls) >= self.site.maxResources:
    newUrls = set(list(newUrls)[:self.site.maxResources])
    self.logger.debug("Site maxResources = %s limit reached.", str(self.site.maxResources))

  if len(newUrls) > 0:
    self.logger.debug("Url was changed. From %s to %s", batchItem.urlObj.url, newUrls[0])
    # ...
    urlUpdateObj = dc_event.URLUpdate(siteId=batchItem.urlObj.siteId, urlString=batchItem.urlObj.url,
                                      normalizeMask=UrlNormalizator.NORM_NONE)
    urlUpdateObj.urlMd5 = batchItem.urlObj.urlMd5
    urlUpdateObj.batchId = batchId
    # ...
    urlUpdateObj.processed = 0
    urlUpdateObj.status = dc_event.URL.STATUS_CRAWLED
    urlUpdateObj.size = 0
    urlUpdateObj.contentType = ""
    result = self.dbWrapper.urlUpdate(urlUpdateObj)
    self.logger.debug("urlUpdate() returned: " + str(result))

    batchItem.urlObj.url = newUrls[0]
    batchItem.urlObj.parentMd5 = batchItem.urlObj.urlMd5
    if urlSchema.externalError != APP_CONSTS.ERROR_OK:
      batchItem.urlObj.errorMask |= urlSchema.externalError
    batchItem.urlId = batchItem.urlObj.urlMd5
    # ...
    result = self.dbWrapper.urlNew([batchItem.urlObj])
    self.logger.debug("urlNew() returned: " + str(result))
  if len(newUrls) > 1:
    for newUrl in newUrls[1:]:
      localBatchItem = copy.deepcopy(batchItem)
      localBatchItem.urlObj.batchId = 0
      localBatchItem.urlObj.status = dc_event.URL.STATUS_NEW
      localBatchItem.urlObj.url = newUrl
      localBatchItem.urlObj.urlMd5 = hashlib.md5(newUrl).hexdigest()
      localBatchItem.urlObj.parentMd5 = batchItem.urlObj.urlMd5
      localBatchItem.urlId = localBatchItem.urlObj.urlMd5
      localBatchItem.urlObj.CDate = str(datetime.datetime.now())
      localBatchItem.urlObj.errorMask = 0
      localBatchItem.urlObj.tagsCount = 0
      localBatchItem.urlObj.tagsMask = 0
      localBatchItem.urlObj.crawled = 0
      localBatchItem.urlObj.processed = 0
      localBatchItem.urlObj.size = 0
      localBatchItem.urlObj.contentType = ""
      localBatchItem.urlObj.rawContentMd5 = ""
      localBatchItem.urlObj.state = dc_event.URL.STATE_ENABLED
      # ...
      if urlSchema.externalError != APP_CONSTS.ERROR_OK:
        localBatchItem.urlObj.errorMask |= urlSchema.externalError
      # ...
    result = self.dbWrapper.urlNew([elem.urlObj])
    self.logger.debug("urlNew() for urls list returned: " + str(result))
    if int(result) == 0:
      urlUpdateObj = dc_event.URLUpdate(siteId=elem.urlObj.siteId, urlString=elem.urlObj.url,
                                        normalizeMask=UrlNormalizator.NORM_NONE)
      urlUpdateObj.urlMd5 = elem.urlObj.urlMd5
      urlUpdateObj.parentMd5 = batchItem.urlObj.urlMd5
      urlUpdateObj.batchId = 0
      urlUpdateObj.status = dc_event.URL.STATUS_NEW
      # ...
      urlUpdateObj.errorMask = 0
      urlUpdateObj.tagsCount = 0
      urlUpdateObj.tagsMask = 0
      # ...
      urlUpdateObj.processed = 0
      urlUpdateObj.size = 0
      urlUpdateObj.contentType = ""
      urlUpdateObj.rawContentMd5 = ""
      urlUpdateObj.state = dc_event.URL.STATE_ENABLED
      # ...
      result = self.dbWrapper.urlUpdate(urlUpdateObj)
      self.logger.debug("urlUpdate() for urls list returned: " + str(result))
  if urlSchema.batchInsert == UrlSchema.BATCH_INSERT_ALL_NEW_ITEMS:
    self.logger.debug("UrlSchema uses 'batch_insert' as 'BATCH_INSERT_ALL_NEW_ITEMS'")
    # ...
  elif urlSchema.batchInsert == UrlSchema.BATCH_INSERT_ONLY_FIRST_ITEM:
    self.logger.debug("UrlSchema uses 'batch_insert' as 'BATCH_INSERT_ONLY_FIRST_ITEM'")
    # ...
  elif urlSchema.batchInsert == UrlSchema.BATCH_INSERT_NO_ONE_ITEMS:
    self.logger.debug("UrlSchema uses 'batch_insert' as 'BATCH_INSERT_NO_ONE_ITEMS'")
    # ...
  else:
    self.logger.error("UrlSchema 'batch_insert' has an unsupported value: " + str(urlSchema.batchInsert))
  if urlObj is not None:
    # ...
    self.logger.debug("Set error_mask: %s", str(urlObj.errorMask))

  # ... and self.crawledResource.content_type != dc_event.URL.CONTENT_TYPE_UNDEFINED:

  if batchItem is not None:
    # ...
    self.logger.debug("len(self.feedItems): %s", str(len(self.feedItems)))
    # ...
  elif self.url is not None:
    self.logger.debug("Before: batchItem urlObj errorMask: %s, url ErrorMask: %s" % (batchItem.urlObj.errorMask,
                                                                                     self.url.errorMask))
    batchItem.urlObj.errorMask |= self.url.errorMask
    batchItem.urlObj.errorMask |= self.errorMask
    self.logger.debug("After: batchItem urlObj errorMask: %s, url ErrorMask: %s" % (batchItem.urlObj.errorMask,
                                                                                    self.url.errorMask))
  if isinstance(batchItem, types.ListType):
    # ...
  self.logger.debug("self.feedItems: %s, self.store_items: %s", str(len(self.feedItems)),
                    # ...
  self.logger.debug(">>> wrong !!! updateBatchItem, batchItem is None")
  self.logger.debug("!!! createBatchItemsFromFeedItems() enter ... self.crawledResource: " + \
                    # ...
  if self.batch.maxItems > len(items):
    urlMd5 = elem["urlMd5"]
    self.logger.debug("URLMD5: %s" % str(urlMd5))
    self.logger.debug("value: %s" % str(elem))
    # ...
    urlObj = elem["urlObj"]
    # ...
    elem.pop("urlObj", None)
    # ...
    dates = ["published_parsed", "updated_parsed"]
    for date in dates:
      if date in elem["entry"] and elem["entry"][date] is not None:
        elem["entry"][date] = strftime("%a, %d %b %Y %H:%M:%S +0000", elem["entry"][date])
    elem["urlMd5"] = urlMd5
    elem["entry"] = dict(elem["entry"])
    # ...
    saveBatchItemUrlId = self.batchItem.urlId
    try:
      # ...
      self.batchItem.urlId = saveBatchItemUrlId
    except Exception as err:
      ExceptionLog.handler(self.logger, err, "Can't save object on disk. Reason:", (), \
                           {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
      self.batchItem.urlId = saveBatchItemUrlId
    urlObj.contentMask = dc_event.URL.CONTENT_STORED_ON_DISK
    if parentBatchItem is not None:
      batchItem = copy.deepcopy(parentBatchItem)
      batchItem.siteId = siteId
      batchItem.urlId = urlMd5
      batchItem.urlObj = urlObj
    else:
      batchItem = dc_event.BatchItem(siteId, urlMd5, urlObj)
    batchItem.urlObj.urlPut = None
    items.append(batchItem)
  self.config.optionxform = str
  if self.pargs.config:
    self.config.read(self.pargs.config)
  # ...
  print MSG_ERROR_LOAD_CONFIG

  className = self.__class__.__name__
  # ...
  except Exception as err:
    ExceptionLog.handler(self.logger, err, "Error load KVDB config option: %s", self.DB_DATA_DIR)

  log_conf_file = self.config.get("Application", "log")
  logging.config.fileConfig(log_conf_file)
  # ...
  print MSG_ERROR_LOAD_LOG_CONFIG_FILE
  if propertyStr is not None and propertyStr != '':
    try:
      propertyObj = json.loads(propertyStr)
      if 'suffix' in propertyObj:
        suffix = propertyObj['suffix'].replace('%PROJECT_ID%', projectId)
        self.logger = mpLogger.getLogger(fileNameSuffix=suffix)
      else:
        self.logger.debug("Suffix field not found for project %s in property: %s", str(projectId), str(propertyObj))
    except Exception as err:
      self.logger.error("Error setting project-specific logger: %s", str(err))
  else:
    self.logger.debug("Wrong or empty file name suffix, project %s logger not set: %s", str(projectId), \
                      # ...

  try:
    self.logger = mpLogger.getLogger(restore=True)
  except Exception as err:
    self.logger.error("Error setting default logger: %s", str(err))
  dbTaskIni = self.config.get(self.__class__.__name__, "db-task_ini")
  # ...
  if self.config.has_option(self.__class__.__name__, "useZeroSiteIdSiteNotExists"):
    self.useZeroSiteIdSiteNotExists = \
        bool(int(self.config.get(self.__class__.__name__, "useZeroSiteIdSiteNotExists")))
  # ...
  if self.config.has_option(self.__class__.__name__, "keep_old_resources"):
    # ...
  except Exception as err:
    ExceptionLog.handler(self.logger, err, "Error load config options:")
    raise Exception('CRAWLER FATAL INITIALIZATION INI ERROR: ' + str(err))
  self.logger.debug("Requests response history: %s", str(self.res.redirects))
  if self.res.redirects and HTTP_REDIRECT in self.res.redirects:
    # ...
    self.logger.debug("http redirect limit was reached! Max http redirects: %s, encountered http redirects: %s." %
                      # ...
  if resource.http_code == CONSTS.HTTP_CODE_304:
    resource.last_modified = self.url.tcDate
  elif 'Last-Modified' in res.headers:
    resource.last_modified = res.headers['Last-Modified']
    resource.last_modified = parse(resource.last_modified).strftime('%Y-%m-%d %H:%M:%S')
  elif 'Date' in res.headers:
    resource.last_modified = res.headers['Date']
    resource.last_modified = parse(resource.last_modified).strftime('%Y-%m-%d %H:%M:%S')
  else:
    resource.last_modified = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(time.time() - self.defaultIcrCrawlTime))
  self.logger.debug("LastModified date: %s", str(resource.last_modified))
  # ...
  return str(resource.last_modified)
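For instance, assuming the `parse` here is dateutil's parser, parse('Wed, 21 Oct 2015 07:28:00 GMT').strftime('%Y-%m-%d %H:%M:%S') yields '2015-10-21 07:28:00', which is the normalized form used throughout this fallback chain.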
  if urlObjects is not None:
    if siteObject is None:
      siteObjectLocal = self.batchItem.siteObj
      # ...
    else:
      siteObjectLocal = siteObject
      siteIdLocal = siteObject.id
    for urlObject in urlObjects:
      self.logger.debug("Create new batch item, URLMd5: %s, siteId: %s", urlObject.urlMd5, siteIdLocal)
      batchItem = dc_event.BatchItem(siteId=siteIdLocal, urlId=urlObject.urlMd5, urlObj=urlObject,
                                     siteObj=siteObjectLocal)
      batchItem.properties = self.batchItem.properties
  for srcBatchItem in sourceBatchItems:
    # ...
    if lookIncomingBatch:
      for dstBatchItem in self.batch.items:
        if srcBatchItem.siteId == dstBatchItem.siteId and srcBatchItem.urlId == dstBatchItem.urlId:
          self.logger.debug("batchItemsExtendUnique baseItems duplicate " + srcBatchItem.urlId + " " +
                            # ...
    for dstBatchItem in destinationBatchItems:
      if srcBatchItem.siteId == dstBatchItem.siteId and srcBatchItem.urlId == dstBatchItem.urlId:
        self.logger.debug("batchItemsExtendUnique duplicate " + srcBatchItem.urlId + " " + dstBatchItem.urlId)
        # ...
    self.logger.debug("batchItemsExtendUnique added, urlId: %s", srcBatchItem.urlId)
    srcBatchItem.urlObj.type = urlType
    # ...
    if self.batch.maxItems is not None and int(self.batch.maxItems) <= len(destinationBatchItems):
      destinationBatchItems = destinationBatchItems[0: self.batch.maxItems]
      destinationBatchItems[-1].urlObj.errorMask |= APP_CONSTS.ERROR_MAX_ITEMS
      self.logger.debug("Set ErrorMask: %s", str(destinationBatchItems[-1].urlObj.errorMask))
      # ...
    destinationBatchItems.append(srcBatchItem)
  if batchItem.urlObj.chainId is None and "URL_CHAIN" in self.siteProperties and \
     # ...

  if urlObjects is not None and len(urlObjects) > 0:
    if batchItem.urlObj.chainId not in self.chainDict:
      self.chainDict[batchItem.urlObj.chainId] = {}
      self.chainDict[batchItem.urlObj.chainId]["batchItem"] = batchItem
      self.chainDict[batchItem.urlObj.chainId]["chainUrlMD5List"] = []
    for urlObj in urlObjects:
      urlObj.chainId = batchItem.urlObj.chainId

  if batchItem.urlObj.chainId is not None and batchItem.urlObj.chainId in self.chainDict:
    localChainElem = self.chainDict[batchItem.urlObj.chainId]
    if batchItem.urlObj.urlMd5 != localChainElem["batchItem"].urlObj.urlMd5 and \
       batchItem.urlObj.urlMd5 not in localChainElem["chainUrlMD5List"]:
      localChainElem["chainUrlMD5List"].append(batchItem.urlObj.urlMd5)
  if logger is not None:
    logger.debug('isHostAvailable url: ' + str(url) + ', parameters: ' + str(parameters))
  if not isinstance(url, basestring) or url == "":
    raise Exception("Bad parameter 'url'")

  if 'method' in parameters and int(parameters['method']) == 0:
    # ...
  pr = urlparse.urlparse(url)
  # ...
  pr = pr.netloc.split(':')
  # ...
  if 'domain_name_resolve' in parameters and int(parameters['domain_name_resolve']) == 1:
    # ...
    ai = socket.getaddrinfo(host, port, 0, 0, socket.IPPROTO_TCP)
    if 'connect_resolve' in parameters and int(parameters['connect_resolve']) == 1:
      if 'connection_timeout' in parameters and float(parameters['connection_timeout']) > 0:
        timeout = float(parameters['connection_timeout'])
      for item in ai:
        af, socktype, proto, canonname, sa = item
        s = socket.socket(af, socktype, proto)
        s.settimeout(float(timeout))
        try:
          # ...
        except Exception as err:
          # ...
          if logger is not None:
            # The original used a '%f' placeholder with str(timeout), which raises
            # a formatting error; '%s' is used here instead.
            logger.debug("Host %s, timeout %s connect check error: %s", str(sa), str(timeout), str(err))
  except Exception as err:
    # ...
    if logger is not None:
      logger.debug("Host %s availability check error: %s", str(url), str(err))
  localChainElem = self.chainDict[localChainKay]
  saveBuf = '\n'.join(localChainElem["chainUrlMD5List"])
  saveBuf = saveBuf.strip()
  putDict = {"data": base64.b64encode(saveBuf)}
  urlPutObj = dc_event.URLPut(localChainElem["batchItem"].siteId, localChainElem["batchItem"].urlObj.urlMd5, \
                              dc_event.Content.CONTENT_CHAIN_PARTS, putDict)
  urlPutList.append(urlPutObj)
  if len(urlPutList) > 0:
    # The original passed [urlPutObj] here, dropping all but the last accumulated
    # object; the accumulated list is what the length check implies.
    self.dbWrapper.putURLContent(urlPutList)
  # ...
  self.urlProcess.siteId = localChainElem["batchItem"].siteId
  self.urlProcess.updateURLStatus(localChainElem["batchItem"].urlId)
  if siteProperties is not None and crawledResource is not None:
    pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
    pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
    if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME in siteProperties:
      pdateSourceMask = int(siteProperties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
    if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME in siteProperties:
      pdateSourceMaskOverwrite = int(siteProperties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
    self.logger.debug('pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
                      str(pdateSourceMask), str(pdateSourceMaskOverwrite))
    self.logger.debug('crawledResource.last_modified = ' + str(crawledResource.last_modified))

    if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_URL_NAME:
      if pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_URL_NAME:
        # ...

    if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
      if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED and ret is None) or \
         not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
        dt = DateTimeType.parse(urlObj.pDate, True, self.logger, False)
        if dt is not None:
          ret = dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
          self.logger.debug("Unsupported date format: <%s>", Utils.varDump(urlObj.pDate))
        self.logger.debug('pubdate from rss feed: ' + str(ret))

    if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_DATE and 'date' in crawledResource.response_header:
      if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_DATE and ret is None) or \
         not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_DATE:
        dt = DateTimeType.parse(value, True, self.logger)
        # ...
        ret = dt.strftime('%Y-%m-%d %H:%M:%S')
        self.logger.debug('pubdate from http header: ' + str(ret))

    if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_LAST_MODIFIED:
      if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_LAST_MODIFIED and ret is None) or \
         not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_HTTP_LAST_MODIFIED:
        if 'last-modified' in crawledResource.response_header:
          d = DateTimeType.parse(value, True, self.logger)
        else:
          d = DateTimeType.parse(crawledResource.last_modified, True, self.logger)
        # ...
        ret = d.strftime('%Y-%m-%d %H:%M:%S')
        self.logger.debug('pubdate from last modified: ' + str(ret))
  except Exception as err:
    # ...
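How the two masks interact, with bit values invented for the example: the source mask enables a date source, and the overwrite mask decides whether that source may only fill an unset value or may replace an existing one.

PDATE_SOURCES_MASK_RSS_FEED = 2
pdateSourceMask = 2 | 4            # RSS feed and HTTP date sources enabled
pdateSourceMaskOverwrite = 2       # RSS feed may only fill, never overwrite
ret = None
if pdateSourceMask & PDATE_SOURCES_MASK_RSS_FEED:
  if (pdateSourceMaskOverwrite & PDATE_SOURCES_MASK_RSS_FEED and ret is None) or \
     not pdateSourceMaskOverwrite & PDATE_SOURCES_MASK_RSS_FEED:
    ret = "2017-01-01 00:00:00"    # the RSS date wins only while ret is unset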
  if isinstance(responseHeader, str) or isinstance(responseHeader, unicode):
    responseHeader = responseHeader.split('\r\n')
  # ...
  for elem in responseHeader:
    begPos = elem.find(name)
    endPos = elem.find(':')
    if begPos > -1 and endPos > -1:
      foundName = elem[begPos:endPos].strip()
      self.logger.debug('foundName: %s', str(foundName))
      if foundName == name:
        ret = elem[endPos + 1:].strip()
        self.logger.debug("value extracted from field '%s': %s", name, str(ret))
  if crawledResource is not None and pubdateRssFeed is not None:
    self.crawledResource.response_header = (crawledResource.response_header + '\r\n' +
                                            CONSTS.pubdateRssFeedHeaderName + ': ' + str(pubdateRssFeed + '+0000'))

  if crawledResource is not None and feedUrl is not None:
    self.crawledResource.response_header = (crawledResource.response_header + '\r\n' +
                                            CONSTS.rssFeedUrlHeaderName + ': ' + str(feedUrl))

  if crawledResource is not None and baseUrl is not None:
    self.crawledResource.response_header = (crawledResource.response_header + '\r\n' +
                                            CONSTS.baseUrlHeaderName + ': ' + str(baseUrl))
  if isinstance(htmlContent, basestring):
    # ...
    if len(urlsList) > 0:
      # ...
  except Exception as err:
    self.logger.error(MSG_ERROR_EXTRACT_BASE_URL, str(err))

  if headers is not None:
    headers = RequestsRedirectWrapper.updateHeadersByCookies(headers, url, self.cookieResolver, stage)
  if propertyName in siteProperties:
    if logger is not None:
      logger.debug("Property '%s' found in site properties", str(propertyName))
    try:
      parameters = json.loads(siteProperties[propertyName])
      if logger is not None:
        logger.debug("Property '%s' successfully parsed from json", str(propertyName))
      ret = CrawlerTask.isHostAvailable(url, parameters, logger)
    except Exception as err:
      if logger is not None:
        logger.error("Getting '%s' failed: %s", str(propertyName), str(err))

  return CrawlerTask.hostAliveHandler(propertyName=CrawlerTask.HOST_ALIVE_CHECK_NAME,
                                      siteProperties=siteProperties,
                                      # ...

  return CrawlerTask.hostAliveHandler(propertyName=CrawlerTask.HOST_ALIVE_CHECK_PROXY_NAME,
                                      siteProperties=siteProperties,
                                      url=CrawlerTask.DEFAULT_PROTOCOL_PREFIX + proxyName,
                                      # ...
  dbProxyWrapper = None
  if HTTPProxyResolver.USER_PROXY_PROPERTY_NAME in siteProperties and dbWrapper is not None:
    # ...
  for triesCount in xrange(HTTPProxyResolver.getTriesCount(siteProperties)):
    proxyName = HTTPProxyResolver.getProxy(siteProperties=siteProperties,
                                           # ...
                                           dbProxyWrapper=dbProxyWrapper)
    if proxyName is not None and not CrawlerTask.isAvailableProxy(siteProperties=siteProperties,
                                                                  proxyName=proxyName,
                                                                  # ...
      logger.debug("Tries count = %s. Proxy '%s' is not available!", str(triesCount), str(proxyName))
      HTTPProxyResolver.addFaults(siteProperties=siteProperties,
                                  # ...
                                  proxyName=proxyName,
                                  dbProxyWrapper=dbProxyWrapper)
    else:
      logger.debug("Tries count = %s. Proxy '%s' is available!", str(triesCount), str(proxyName))
      # ...

  dbProxyWrapper = None
  if HTTPProxyResolver.USER_PROXY_PROPERTY_NAME in siteProperties and dbWrapper is not None:
    # ...
  if proxyName is not None:
    HTTPProxyResolver.addFaults(siteProperties=siteProperties,
                                # ...
                                proxyName=proxyName,
                                dbProxyWrapper=dbProxyWrapper)

  dbProxyWrapper = None
  if HTTPProxyResolver.USER_PROXY_PROPERTY_NAME in siteProperties and dbWrapper is not None:
    # ...
  if proxyName is not None:
    ret = HTTPProxyResolver.isNeedRotateProxy(siteProperties=siteProperties,
                                              # ...
                                              proxyName=proxyName,
                                              dbProxyWrapper=dbProxyWrapper,
                                              rawContent=rawContent)
  self.logger.debug("Signal %s - timer trapped!", str(signum))

  dbTasksWrapper = None
  # ...
  if configName == "":
    raise Exception(MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
  try:
    config = ConfigParser.ConfigParser()
    config.optionxform = str
    readOk = config.read(configName)
    if len(readOk) == 0:
      raise Exception(MSG_ERROR_WRONG_CONFIG_FILE_NAME % configName)
    # ...
  except Exception as err:
    raise Exception(MSG_ERROR_LOAD_APP_CONFIG % str(err))
  return dbTasksWrapper
def updateURL(input_url, site)
def signalHandlerTimer(self, signum, frame)
def batchItemsExtendUnique(self, destinationBatchItems, sourceBatchItems, lookIncomingBatch=True, urlType=1)
statusUpdateEmptyProxyList
def commonChecks(self, urlObj)
def addProxyFaults(siteProperties, siteId, proxyName, dbWrapper)
statusUpdateNoAvailableProxy
def updateHeadersByCookies(self, headers, url, stage)
def createBatchItemsFromFeedItems(self, parentBatchItem)
def readSmallFileContent(path)
def calcLastModified(self, resource, res)
def fillChainUrlMD5List(self, batchItem)
def updateSiteParams(self, mask, is_suspend=False)
def isAvailableUrl(siteProperties, url, logger=None)
def addBaseUrlToHeader(self, crawledResource, baseUrl)
def updateCollectedURLs(self)
def loadSite(self, batch)
def processRotatedHeaders(self, url)
def getPubdateUseSourceMask(self, siteProperties, crawledResource, urlObj)
def changeBatchItemByUrlSchema(self, batchItem, batchId)
def saveChainStorageData(self)
def readSiteProperties(self)
def loadLogConfigFile(self)
string URLS_XPATH_LIST_FILE
def makeDefaultResponse(self, response, httpCode=CONSTS.HTTP_CODE_400)
def httpRequestWrapper(self, url, headers, auth, postData, urlObj, incomingContent, macroCode=None, proxyName=None)
def fetchFileHeader(self, url, siteId)
def generateBatchitemsByURLSchema(self, batch)
def fillItemsFromIterations(self, urlObjects=None, siteObject=None, reset=True)
def updateBatchItem(self, batchItem)
def isHostAvailable(url, parameters, logger=None, timeout=0.5)
def addPubdateRssFeedToHeader(self, crawledResource, pubdateRssFeed)
dictionary errorMaskHttpCodeDict
def getProxyName(siteProperties, siteId, url, dbWrapper, logger)
def extractBaseUrl(self, htmlContent, default)
def isRootURL(self, urlObj, urlString=None)
def isNeedRotateProxy(siteProperties, siteId, proxyName, dbWrapper, rawContent)
def crawl(self, incomingContent)
def updateUrlObjInBatchItem(self, urlObj)
useZeroSiteIdSiteNotExists
def setLogConfigFileProject(self, mpLogger, projectId, propertyStr)
def isAvailableProxy(siteProperties, proxyName, logger=None)
urlTempalteRealtimeEncode
string SEARCH_BASE_URL_PATTERN
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def saveFileHeader(self, url, siteId, localFileHeaders)
def hostAliveHandler(propertyName, siteProperties, url, logger=None)
def updateBatchItemAfterCarwling(self, status=dc_event.URL.STATUS_CRAWLED)
def fillItemsFromIterationsWithChain(self, urlObjects, batchItem)
def initHTTPHeaders(self)
def updateURLForFailed(self, errorBit, httpCode=CONSTS.HTTP_CODE_400, status=dc_event.URL.STATUS_CRAWLED, updateUdate=True)
def setLogConfigFileDefault(self, mpLogger)
def extractValueFromHeader(self, responseHeader, name)
def processBatchItem(self)
def __createDBTasksWrapper(self, configName)
def strToProxy(proxyString, log=None, defaultProxyType='http')
def addFeedUrlToHeader(self, crawledResource, feedUrl)
def setChainId(self, batchItem)