# @author Scorp <developers.hce@gmail.com>
# @link: http://hierarchical-cluster-engine.com/
# @copyright: Copyright © 2013-2014 IOIX Ukraine
# @license: http://hierarchical-cluster-engine.com/license/

# Case-insensitive pattern that matches the substrings "text" or "xml".
# NOTE(review): despite the BINARY_ prefix the pattern matches textual types;
# presumably a non-match is what marks a content type as binary - confirm
# against the call sites.
BINARY_CONTENT_TYPE_PATTERN = re.compile('(text)|(xml)', re.I)

# Name and value of the COLLECT_POST_DATA setting.
# NOTE(review): presumably a site property name/value pair - confirm.
COLLECT_POST_DATA_NAME = "COLLECT_POST_DATA"
COLLECT_POST_DATA = "1"

# NOTE(review): presumably the prefix of per-site URL table names - confirm.
DC_URLS_TABLE_PREFIX = "urls_"

# Matches one or more ASCII letters followed by ':' and an optional '//'.
PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')

# String-encoded selector values for MIME detection.
DETECT_MIME_MAIN_CONTENT = "1"
DETECT_MIME_COLLECTED_URL = "2"

# NOTE(review): unit is not visible here (presumably seconds) - confirm.
DETECT_MIME_TIMEOUT = 1
71 self.
isAbortedByTTL = (
lambda:
False)
if isAbortedByTTL
is None else isAbortedByTTL
77 excludeList = [
'feedItems',
'feed',
'processorName',
'autoDetectMime',
'processContentTypes',
78 'postForms',
'robotsParser',
'dom',
'dbWrapper']
79 for field
in self.__dict__:
80 if field
not in excludeList
and (
not hasattr(self, field)
or getattr(self, field)
is None):
81 msg =
"Mandatory field must be initialized, field Name = " + field
92 def process(self, httpCode, readOnly=False, httpApplyHeaders=None, proxyName=None):
102 internalLinks, externalLinks = [], []
111 logger.debug(
"!!! self.site.maxURLsFromPage = " + str(self.
site.maxURLsFromPage))
112 logger.debug(
"!!! self.url.maxURLsFromPage = " + str(self.
url.maxURLsFromPage))
114 if self.
site is not None and self.
site.maxURLsFromPage
is not None:
115 maxURLsFromPage = self.
site.maxURLsFromPage
117 if self.
url is not None and self.
url.maxURLsFromPage
is not None and self.
url.maxURLsFromPage > 0:
118 maxURLsFromPage = self.
url.maxURLsFromPage
127 if code_type == 4
or code_type == 5:
139 if self.
dom is not None:
142 urlSet.update(formUrls)
144 logger.debug(
"DOM is None")
146 if self.
url.type == dc.EventObjects.URL.TYPE_SINGLE:
147 logger.debug(
"URL type: single")
151 SeleniumFetcher.MACRO_RESULT_TYPE_URLS_LIST:
155 except Exception, err:
156 logger.error(
"Error deserialize macro data from result string: %s\n%s", str(err),
159 logger.debug(
"Fill urlSet from macro results: %s items", str(len(ul)))
160 if isinstance(ul, list):
161 urlSet.update([u
for u
in ul
if isinstance(u, basestring)
and u !=
''])
164 if self.
url.type == dc.EventObjects.URL.TYPE_CHAIN:
165 logger.debug(
"URL type: chain")
177 params, maxURLsFromPage,
True)
179 logger.debug(
">>> Wrong \"RSS_FEED_ZERO_ITEM\" property's value")
181 logger.debug(
"URLs candidates collected %s items:\n%s", str(len(urlSet)), str(urlSet))
183 if self.
site.maxURLs > 0
and len(urlSet) >= self.
site.maxURLs:
184 urlSet = set(list(urlSet)[:self.
site.maxURLs])
185 logger.debug(
"Site maxURLs = %s limit reached.", str(self.
site.maxURLs))
187 if self.
site.maxResources > 0
and len(urlSet) >= self.
site.maxResources:
188 urlSet = set(list(urlSet)[:self.
site.maxResources])
189 logger.debug(
"Site maxResources = %s limit reached.", str(self.
site.maxResources))
193 for elemUrl
in urlSet:
196 logger.debug(
"Aborted by TTL. All elements skipped.")
200 logger.debug(
"Some url from urlSet is None, skipped.")
203 elemUrl = elemUrl.strip()
205 logger.debug(
"Some url from urlSet is empty, skipped!")
215 if retUrl
is not None:
217 elemUrl = UrlNormalize.execute(siteProperties=self.
siteProperties, base=self.
baseUrl, url=elemUrl, supportProtocols=
None, log=logger)
222 logger.debug(
"Candidate URL is not passed general checks, skipped: %s", str(elemUrl))
228 depth=self.
url.depth,
231 logger.debug(
"Candidate URL not matched filters, skipped.")
234 logger.debug(
"Candidate URL matched filters.")
237 urlMd5 = hashlib.md5(elemUrl).hexdigest()
242 logger.debug(
"Candidate URL %s already exist, skipped.", str(urlMd5))
245 if self.
site.maxURLs > 0:
246 if httpCode == CRAWLER_CONSTS.HTTP_CODE_200:
253 if currentCnt >= self.
site.maxURLs
or countCnt >= self.
site.maxURLs
or \
254 (countCnt + countErrors) >= self.
site.maxURLs:
255 logger.debug(
"Site MaxURLs: %s limit is reached. countCnt = %s, currentCnt = %s",
256 str(self.
site.maxURLs), str(countCnt), str(currentCnt))
259 if autoremovedURLs == 0:
260 logger.debug(
"No one URL auto removed, candidate URL skipped!")
263 logger.debug(
"%s URLs auto removed.", str(autoremovedURLs))
265 if currentCnt >= self.
site.maxResources
or countCnt >= self.
site.maxResources
or \
266 (countCnt + countErrors) >= self.
site.maxResources:
267 logger.debug(
"Site maxResources = %s limit is reached. countCnt = %s, currentCnt = %s",
268 str(self.
site.maxResources), str(countCnt), str(currentCnt))
271 if autoremovedURLs == 0:
272 logger.debug(
"No one URL auto removed, candidate URL skipped!")
275 logger.debug(
"%s URLs auto removed.", str(autoremovedURLs))
285 logger.debug(
"Candidate URL MIME type is not matched, skipped!")
290 logger.debug(
"Robots.txt obey mode is ON")
296 logger.debug(
"URL " + elemUrl +
" is NOT Allowed by user-agent:" + str(retUserAgent))
305 logger.debug(
'!!!!!! HTTP_REDIRECT_RESOLVER !!!!! ')
309 connectionTimeout = float(self.
siteProperties[
"CONNECTION_TIMEOUT"])
311 connectionTimeout = CRAWLER_CONSTS.CONNECTION_TIMEOUT
313 tm = int(self.
url.httpTimeout) / 1000.0
314 if isinstance(self.
url.httpTimeout, float):
315 tm += float(
'0' + str(self.
url.httpTimeout).strip()[str(self.
url.httpTimeout).strip().find(
'.'):])
317 proxies = {
"http":
"http://" + proxyName}
if proxyName
is not None else None 323 if authName
is not None and authPwd
is not None:
324 auth = (authName, authPwd)
328 if key.startswith(
'HTTP_POST_FORM_'):
329 postForms[key[len(
'HTTP_POST_FORM_'):]] = self.
siteProperties[key]
330 postData = self.
urlProcess.resolveHTTP(postForms, httpApplyHeaders)
332 maxRedirects = HTTPRedirectResolver.RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS
337 fetchType=self.
site.fetchType,
340 connectionTimeout=connectionTimeout)
342 resUrl = redirectResolver.resolveRedirectUrl(url=elemUrl,
343 headers=httpApplyHeaders,
349 maxRedirects=maxRedirects,
350 filters=self.
site.filters)
352 logger.debug(
"Resolved url: %s", str(resUrl))
355 if elemUrl
is not None:
359 localUrlObj = self.
urlProcess.createUrlObjForCollectURLs(urlMd5, formMethods, self.
batchItem.urlId, depth,
360 detectedMime, self.
site.maxURLsFromPage)
362 localUrlObj.linksI = len(internalLinks)
363 localUrlObj.linksE = len(externalLinks)
369 params.append(localUrlObj)
374 if "url_pattern" in localChainDict:
375 for elemUrl
in urlSet:
377 logger.debug(
"Some url from urlSet is None")
385 logger.debug(
"Bad url normalization, url: %s", retUrl)
395 urlMd5 = hashlib.md5(elemUrl).hexdigest()
401 createUrlObjForChain(localChainDict[
"url_pattern"], urlMd5, formMethods,
402 self.
batchItem.urlId, depth, detectedMime, self.
site.maxURLsFromPage)
403 if localUrlObj
is not None:
404 chainUrls.append(copy.deepcopy(localUrlObj))
405 except Exception
as excp:
406 logger.error(
"Error in URL_CHAIN deserialize, excp = " + str(excp))
407 if len(urlSet) > 0
and len(params) == 0:
408 logger.debug(
"Zero urls are collected for len(urlSet): %s", str(len(urlSet)))
409 elif len(params) > 0:
410 logger.debug(
"Collected and send to insert as new: %s", str(len(urlSet)))
416 self.
urlProcess.updateTypeForURLObjects(chainUrls)
417 self.
dbWrapper.collectedURLsRecalculating(localSiteId)
419 if formFields
is not None and self.
postForms is not None and self.
dbWrapper is not None:
426 return nextStep, internalLinks, externalLinks, params, self.
feedItems, chainUrls
441 maxURLsFromPage, rootFeed=False):
443 if maxURLsFromPage > 0
and len(self.
feedItems) >= maxURLsFromPage:
444 logger.debug(
"Site maxURLsFromPage = %s limit reached on %s number.",
445 str(maxURLsFromPage), str(len(self.
feedItems)))
447 if self.
feed is not None:
454 if localRet
is not None:
455 localRet[
"urlMd5"] = urlMd5
456 if localRet[
"urlObj"]
is not None:
457 localRet[
"urlObj"].httpCode = httpCode
458 localRet[
"urlObj"].processingDelay = 0
459 localRet[
"urlObj"].parentMd5 = self.
url.urlMd5
463 params.append(localRet[
"urlObj"])
466 logger.debug(
"self.feed is None!")
476 and urlObj.type != dc.EventObjects.URL.TYPE_FETCHED:
477 if feedparser
is not None:
488 excludes = [
"_start_rss",
"_start_channel",
"_start_feed",
"_start_item",
"_start_link",
489 "_start_admin_errorreportsto",
"_start_admin_generatoragent",
"_start_guid",
"_start_id",
490 "_start_entry",
"_start_enclosure"]
491 for methodName, functionObject
in inspect.getmembers(feedparser._FeedParserMixin, predicate=inspect.ismethod):
492 if methodName.startswith(
"_start_")
and methodName
not in excludes:
493 delattr(feedparser._FeedParserMixin, methodName)
494 endMethodName = methodName.replace(
"_start_",
"_end_")
495 if hasattr(feedparser._FeedParserMixin, endMethodName):
496 delattr(feedparser._FeedParserMixin, endMethodName)
498 feedparser.FeedParserDict.keymap[
"guid"] =
"guid" 499 logger.debug(
"Feedparser in modified mode")
501 logger.debug(
"Feedparser in native mode")
504 urlSet.update(entry.link
for entry
in self.
feed.entries)
507 if len(self.
feed.entries) == 0:
510 logger.debug(
"self.crawledResource.content_type = %s", str(self.
crawledResource.content_type))
511 if self.
crawledResource.content_type == dc.EventObjects.URL.CONTENT_TYPE_TEXT_HTML:
512 urlObj.errorMask |= APP_CONSTS.ERROR_MASK_SITE_UNSUPPORTED_CONTENT_TYPE
514 except TypeError
as err:
515 logger.debug(
"WRONG CONTENT FOR URL <" + str(urlObj.url) +
"> not rss feed. " + str(err.message))
516 except Exception
as err:
517 logger.debug(
"SOME ERROR WITH rss feed parse " + str(err.message))
519 logger.debug(
"feedparser module not found")
523 if len(urlXpathList) > 0:
524 logger.debug(
"Site has COLLECT_URLS_XPATH_LIST property: %s", str(urlXpathList))
528 logger.debug(
"Site has no COLLECT_URLS_XPATH_LIST property, default xpath list used: %s", str(urlXpathList))
529 if 'sets' in urlXpathList
and isinstance(urlXpathList[
'sets'], dict):
531 if 'date_format' in urlXpathList:
532 dformat = str(urlXpathList[
'date_format'])
534 dformat =
'%Y-%m-%d %H:%M:%S' 535 for rexpr
in urlXpathList[
'sets']:
536 if rexpr ==
'' or re.search(rexpr, urlObj.url)
is not None:
537 if 'mode' in urlXpathList
and int(urlXpathList[
'mode']) == 1:
540 xpathl = urlXpathList[
'sets'][rexpr]
544 elem = dom.xpath(xpath)
545 elem_type =
type(elem)
546 if elem_type == list
and len(elem) > 0
and hasattr(elem[0],
"tail"):
547 urlSet.update([el.tail
for el
in elem])
548 elif elem_type == list
and len(elem) > 0
and isinstance(elem[0], lxml.html.HtmlElement):
549 urlSet.update([el.text
for el
in elem])
553 logger.debug(
"Warning! No one xpath set matched URL %s, URLs not collected!", urlObj.url)
555 logger.debug(
'Wrong COLLECT_URLS_XPATH_LIST property, `sets` key with dict() of re->xpath_list[] expected!' + \
556 ' Collect URLs aborted!')
569 d = {
'DATE':
'',
'SHORTYEAR':
'y',
'YEAR':
'Y',
'MONTH':
'm',
'DAY':
'd',
'HOUR':
'H',
'MINUTE':
'M',
'SECOND':
'S'}
570 regex = re.compile(
"%@(SHORTYEAR|YEAR|MONTH|DAY|HOUR|MINUTE|SECOND|DATE)\\(([\\+|\\-]\\d{1,2})\\)%")
571 matchArray = regex.findall(localPattern)
# i[0] is the macro name and i[1] its signed hour offset text (e.g. "+3", "-12"),
# the two groups captured by the macro regex.
# total_seconds() keeps both the sign and the days component of the offset;
# timedelta.seconds alone is always in [0, 86400), so a negative offset or one
# of 24 hours and more used to render the wrong moment.
offsetSeconds = datetime.timedelta(hours=int(i[1])).total_seconds()
t = time.strftime(f, time.gmtime(time.time() + offsetSeconds))
# Substitute the rendered value for this exact macro occurrence.
localPattern = localPattern.replace("%@" + i[0] + "(" + i[1] + ")%", t)
579 except Exception, err:
580 logger.error(str(err))
592 formUrls, formMethods, formFields = [], {}, {}
595 for form
in dom.xpath(
"//form"):
598 for attr
in form.keys():
599 if attr.lower() ==
"action":
600 formAction = form.get(attr)
601 formUrls.append(formAction)
602 elif attr.lower() ==
"method":
603 formMethod = form.get(attr)
606 formMethods[formAction] = formMethod
607 for field
in form.getchildren():
608 tagName, tagValue =
None,
"" 609 for fieldTag
in field.keys():
610 if fieldTag.lower() ==
"name":
611 tagName = field.get(fieldTag)
612 elif fieldTag.lower() ==
"value":
613 tagValue = field.get(fieldTag)
615 formFields[tagName] = tagValue
616 logger.info(
"extracted form data, formUrls:%s, formMethods:%s, formFields:%s", \
617 formUrls, formMethods, formFields)
618 return formUrls, formMethods, formFields
635 def filtersApply(inputFilters, subject, depth, wrapper, siteId, fields=None, opCode=Filters.OC_RE, \
636 stage=Filters.STAGE_COLLECT_URLS, selectSubject=None, defaultValue=False):
638 fValue = Utils.generateReplacementDict()
639 fValue.update({
"MAX_DEPTH": str(depth)})
641 if inputFilters
is not None:
642 for inputFilter
in inputFilters:
643 if inputFilter.stage == Filters.STAGE_ALL
or inputFilter.stage == Filters.STAGE_REDIRECT_URL:
644 inputFilter.stage = Filters.STAGE_COLLECT_URLS
647 localFilters =
Filters(filters=inputFilters, dbTaskWrapper=wrapper, siteId=siteId, readMode=0, fields=fields,
648 opCode=opCode, stage=stage, selectSubject=selectSubject)
651 fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, 1)
652 logger.debug(
">>> filter result include - " + str(fResult))
661 fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, -1)
662 logger.debug(
">>> filter result exclude - " + str(fResult))
669 logger.debug(
"Verdict: " + str(ret))
680 for fieldName, fieldValue
in formFields.iteritems():
681 if fieldName
in postForms:
683 logger.debug(
"field_name: %s", fieldName)
684 logger.debug(
"field_value: %s", fieldValue)
685 ret.append((siteId,
"HTTP_POST_FORM_" + fieldName, fieldValue))
695 if siteId
is not None and hasattr(params,
'__iter__')
and len(params) > 0:
697 for attr
in localSiteUpdate.__dict__:
698 if hasattr(localSiteUpdate, attr):
699 setattr(localSiteUpdate, attr,
None)
700 localSiteUpdate.updateType = dc.EventObjects.SiteUpdate.UPDATE_TYPE_APPEND
701 localSiteUpdate.id = siteId
702 localSiteUpdate.properties = []
705 newPropElem[
"siteId"] = param[0]
706 newPropElem[
"name"] = param[1]
707 newPropElem[
"value"] = param[2]
708 localSiteUpdate.properties.append(newPropElem)
# Split the date string on single spaces; split(' ') always yields at least
# one element, so the last token can be addressed directly.
parts = ds.split(' ')
lastPart = parts[-1]
# When the trailing token contains ':' together with '+' or '-', remove the
# colons from it.
# NOTE(review): presumably this targets a "+HH:MM" style zone offset - confirm.
if ":" in lastPart and ("+" in lastPart or "-" in lastPart):
    parts[-1] = lastPart.replace(":", "")
725 ret = feedparser._parse_date(ds)
731 return (kv[0], kv[1])
def getFieldParams(self, formFields, postForms, siteId)
def insertNewSiteProperties(self, params, wrapper, siteId)
def feedElementsProcessing(self, urlMd5, httpCode, elemUrl, localSiteId, localUrlObj, localUrl, params, maxURLsFromPage, rootFeed=False)
def evaluateDateMacro(self, localPattern, dateFromat)
def __init__(self, isAbortedByTTL=None)
def extractFormURL(self, dom, siteProperties)
def filtersApply(inputFilters, subject, depth, wrapper, siteId, fields=None, opCode=Filters.OC_RE, stage=Filters.STAGE_COLLECT_URLS, selectSubject=None, defaultValue=False)
def _normalize_attributes(self, kv)
BINARY_CONTENT_TYPE_PATTERN
def checkFieldsIsNone(self)
def process(self, httpCode, readOnly=False, httpApplyHeaders=None, proxyName=None)
string DC_URLS_TABLE_PREFIX
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def urlNormalization(base, url, supportProtocols=None, log=None)
string COLLECT_POST_DATA_NAME
string DETECT_MIME_COLLECTED_URL
def processProcessor(self, urlSet, dom, urlXpathList, urlObj)
def feedparserParseDateFixes(self, aDateString)