@author Scorp, Vybornyh, bgv <developers.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2016 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/

import CrawlerTask as CrawlerTask
DC_URLS_TABLE_PREFIX = "urls_"
DETECT_MIME_TIMEOUT = 1
PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
URL_TEMPLATE_CONST = "%URL%"
PROTOCOL_PREFIX = "://"
DEFAULT_PROTOCOLS = ["http", "https"]
if protocols is not None:

  logger.debug(">>> THAT PROTOCOLS = " + str(self.protocolsList))
if position > 0 and url.find('/') == (position + 1):

logger.debug(">>> URL skipped by protocol = " + url)
for name in checkList:
  if not hasattr(self, name) or getattr(self, name) is None:
    raise Exception("Some mandatory field `%s` must be initialized!" % name)
countsql = "SELECT COUNT(*) AS cnt FROM `%s` WHERE NOT (Status=4 AND Crawled=0 AND Processed=0)" % \
           self.urlTable
result = self.dbWrapper.customRequest(countsql, CrawlerTask.DB_URLS)
if result is not None and len(result) > 0 and len(result[0]) > 0:
  currentCnt = result[0][0]

logger.debug("!!! maxURLs = %s, currentCnt = %s", str(maxURLs), str(currentCnt))
self.url = urlparse.urljoin(realUrl, self.url)
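
# Why urljoin is used here: relative links collected from a page must be
# resolved against the page's real (post-redirect) URL before they can be
# hashed and stored. Stdlib-only illustration (Python 2 urlparse, as the
# module itself uses):
import urlparse

assert urlparse.urljoin("http://example.com/a/b.html", "c.html") == \
       "http://example.com/a/c.html"
assert urlparse.urljoin("http://example.com/a/b.html", "/c.html") == \
       "http://example.com/c.html"
assert urlparse.urljoin("http://example.com/a/", "http://other.org/x") == \
       "http://other.org/x"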
def processURL(self, realUrl, internalLinks, externalLinks, filtersApply=None, siteFilters=None, baseUrl=None):

  logger.debug("URL: %s", self.url)

  if self.urlObj.type == dc.EventObjects.URL.TYPE_SINGLE or not self.url:
    logger.debug("URL type is TYPE_SINGLE - do not collect urls. Skip url.")

  if retUrl is not None:

    protocolAllowed = True
    if filtersApply is not None:

      logger.debug(">>> Filters() (3.1) siteFilters: " + str(siteFilters))

                             Filters.STAGE_COLLECT_URLS_PROTOCOLS)

      if localFilters.isExistStage(Filters.STAGE_COLLECT_URLS_PROTOCOLS):
        resFilterApply = filtersApply(siteFilters, retUrl, 0, self.dbWrapper, self.siteId,
                                      None, Filters.OC_RE, Filters.STAGE_COLLECT_URLS_PROTOCOLS)
        logger.debug("Filter apply: " + str(resFilterApply))

    logger.debug("checkUrlByProtocol returned: " + str(protocolAllowed))
    logger.debug("retUrl: " + str(retUrl))
    logger.debug("realUrl: " + str(realUrl))

    if Utils.parseHost(retUrl) == Utils.parseHost(realUrl):
      internalLinks.append(retUrl)
      logger.debug("URL classified as internal")
    elif Utils.parseHost(retUrl):
      externalLinks.append(retUrl)
      logger.debug("URL classified as external")
  else:
    logger.debug(">>> Bad url normalization, url = " + str(retUrl))

  return retUrl, retContinue
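
# Internal vs. external classification, as processURL does it above: compare
# the host of the collected URL with the host of the page it came from.
# Utils.parseHost is project code; urlparse.netloc is assumed to be an
# adequate stand-in for this sketch:
import urlparse

def parseHost(url):
  return urlparse.urlparse(url).netloc.lower()

internalLinks, externalLinks = [], []
realUrl = "http://example.com/index.html"
for link in ["http://example.com/about", "http://other.org/page"]:
  if parseHost(link) == parseHost(realUrl):
    internalLinks.append(link)   # same host -> internal
  elif parseHost(link):
    externalLinks.append(link)   # different, non-empty host -> external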
if "RECRAWL_URL_AGE_EXPRESSION" in self.siteProperties:
  ageExpr = self.siteProperties["RECRAWL_URL_AGE_EXPRESSION"].replace("%RECRAWL_PERIOD%", str(recrawlPeriod))
else:
  ageExpr = "(DATE_ADD(UDate, INTERVAL %s MINUTE)-NOW())" % (str(recrawlPeriod))
query = "SELECT COUNT(*), %s, `Type` FROM `%s` WHERE `URLMd5` = '%s'" % (ageExpr, self.urlTable, urlMd5)
result = self.dbWrapper.customRequest(query, CrawlerTask.DB_URLS)
if result is not None and len(result) > 0 and len(result[0]) > 0 and result[0][0] > 0:
  if recrawlPeriod == 0 or result[0][1] > 0 or result[0][2] == dc.EventObjects.URL.TYPE_FETCHED or \

    logger.debug("URL skipped, exists and re-crawling not active, time not reached or URL Type is " +
                 "RSS feed (not to fetch)\n %s %s", self.url, urlMd5)

  logger.debug("URL state updated to NEW because of re-crawling\n %s %s", self.url, urlMd5)

logger.debug("URL %s treated as new\n %s", self.url, urlMd5)
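
# What the age expression computes: minutes remaining until the re-crawl
# period elapses for a stored URL (negative once the URL is due). A hedged
# sketch of the default expression with the period substituted in; the table
# name "urls_1234" is a hypothetical per-site name (DC_URLS_TABLE_PREFIX plus
# a site id):
recrawlPeriod = 60  # minutes, example value
ageExpr = "(DATE_ADD(UDate, INTERVAL %s MINUTE)-NOW())" % str(recrawlPeriod)
query = "SELECT COUNT(*), %s, `Type` FROM `urls_1234` WHERE `URLMd5` = '%s'" % \
        (ageExpr, "d41d8cd98f00b204e9800998ecf8427e")
# result[0][1] > 0  -> re-crawl period not yet elapsed, the URL is skipped
# result[0][1] <= 0 -> URL is due, its state is reset to NEW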
logger.debug('!!!!!! updateURLFields !!! self.siteProperties: ' + str(self.siteProperties))

urlUpdateObj.status = int(self.siteProperties["RECRAWL_URL_UPDATE_STATUS"])

urlUpdateObj.status = None

urlUpdateObj.status = dc.EventObjects.URL.STATUS_NEW

urlUpdateObj.tcDate = self.siteProperties["RECRAWL_URL_UPDATE_TCDATE"]

urlUpdateObj.tcDate = None

urlUpdateObj.CDate = self.siteProperties["RECRAWL_URL_UPDATE_CDATE"]

urlUpdateObj.UDate = self.siteProperties["RECRAWL_URL_UPDATE_UDATE"]

urlUpdateObj.UDate = None

if wrapper is not None:
  saveAffectDB = wrapper.affect_db
  wrapper.affect_db = True
  wrapper.urlUpdate(urlUpdateObj, "`State`=0")
  wrapper.affect_db = saveAffectDB
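
# The save/assign/restore dance around wrapper.affect_db is the classic
# "temporarily override a flag" pattern. A hedged alternative sketch using a
# context manager (not part of the original module) that restores the flag
# even if urlUpdate() raises:
from contextlib import contextmanager

@contextmanager
def forcedAffectDB(wrapper, value=True):
  saved = wrapper.affect_db
  wrapper.affect_db = value
  try:
    yield wrapper
  finally:
    wrapper.affect_db = saved

# usage (hypothetical wrapper object):
#   with forcedAffectDB(wrapper):
#     wrapper.urlUpdate(urlUpdateObj, "`State`=0")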
if dbWrapper is not None:
  propertyStruct = None
  try:
    propertyStruct = json.loads(recrawlUrlUpdateProperty)
  except Exception as err:
    logger.error("Load of property 'RECRAWL_URL_UPDATE' failed, error: %s", str(err))

  if propertyStruct is not None:
    try:
      for pattern, rules in propertyStruct.items():
        if re.search(pattern, self.url) is not None:

          if "new" in rules and int(rules["new"]) > 0:
            saveAffectDB = dbWrapper.affect_db
            dbWrapper.affect_db = True
            dbWrapper.urlNew(self.urlObj)
            dbWrapper.affect_db = saveAffectDB

          if "fields" in rules and isinstance(rules["fields"], dict):
            for key, value in rules["fields"].items():
              if key in DB_CONSTS.URLTableDict.values():
                for urlUpdateObjName, DBSchemaName in DB_CONSTS.URLTableDict.items():
                  if key == DBSchemaName and hasattr(urlUpdateObj, urlUpdateObjName):
                    setattr(urlUpdateObj, urlUpdateObjName, value)
                    logger.debug("For '" + str(DBSchemaName) + "' found attribute 'UrlUpdate." + \
                                 str(urlUpdateObjName) + "' and set value: " + str(value) + \
                                 " type: " + str(type(value)))
              else:
                logger.debug("Wrong DB schema field name '" + str(key) + "' in property 'RECRAWL_URL_UPDATE'")
    except Exception as err:
      logger.error("Usage of property 'RECRAWL_URL_UPDATE' failed, error: %s", str(err))
ret = res.headers.get('content-type', '').lower()
if contentTypeMap is not None and ret in contentTypeMap:
  logger.debug(">>> Mime type replaced from %s to %s", ret, contentTypeMap[ret])
  ret = contentTypeMap[ret]

logger.warn("detect mime type for %s failed", self.url, exc_info=True)
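
# A hedged sketch of content-type detection with a bounded timeout (see
# DETECT_MIME_TIMEOUT above). It assumes the `requests` library, which the
# fragment's res.headers.get(...) call is consistent with; using a HEAD
# request is an assumption of this sketch, not confirmed by the source.
import requests

def detectMime(url, contentTypeMap=None, timeout=1):
  try:
    res = requests.head(url, timeout=timeout, allow_redirects=True)
    ret = res.headers.get('content-type', '').lower()
    if contentTypeMap is not None and ret in contentTypeMap:
      ret = contentTypeMap[ret]  # normalize via the site-level mapping
    return ret
  except Exception:
    return ''  # mirrors the module's "detect mime type failed" path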
result = self.dbWrapper.urlStatus(urlStatusObj, True)

ret = result[0].depth
def updateURLForFailed(self, errorBit, batchItem, httpCode=CONSTS.HTTP_CODE_400,
                       status=dc.EventObjects.URL.STATUS_CRAWLED, updateUdate=True):

  logger.debug("Set errorBit = " + str(errorBit) + ", httpCode = " + str(httpCode))

  batchItem.urlObj.errorMask = batchItem.urlObj.errorMask | errorBit
  urlUpdateObj.errorMask = SQLExpression("`ErrorMask` | " + str(errorBit))

  urlUpdateObj.status = batchItem.urlObj.status = status
  urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")

  if updateUdate:
    urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")

  if httpCode is not None:
    urlUpdateObj.httpCode = batchItem.urlObj.httpCode = httpCode
    self.urlObj.httpCode = httpCode

                       batchItem.urlObj, logger,
                       APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER)

  for name, value in changedFieldsDict.items():
    if hasattr(urlUpdateObj, name):
      setattr(urlUpdateObj, name, value)
  urlUpdateObj.errorMask = SQLExpression("`ErrorMask` | " + str(errorBit))
if self.url.startswith("http%3A") or self.url.startswith("https%3A"):
  ret = urllib.unquote(self.url).decode('utf-8')
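
# Why the startswith("http%3A") check exists: some collected links arrive with
# a percent-encoded scheme separator ("%3A" is ":"). Decoding restores a
# usable absolute URL (Python 2 urllib, matching the module):
import urllib

raw = "http%3A//example.com/a%20b.html"
assert urllib.unquote(raw) == "http://example.com/a b.html"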
logger.debug("headersDict: %s", str(headersDict))

method = self.urlObj.httpMethod.lower()

logger.debug("use post, post_data:%s", postData)
def updateCrawledURL(self, crawledResource, batchItem, contentSize, status=dc.EventObjects.URL.STATUS_CRAWLED):

  logger.debug(">>> Start urls update")

  updatedCount = self.urlObj.mRate * self.urlObj.mRateCounter
  if crawledResource.http_code != 304:

    mrate = updatedCount / (self.urlObj.mRateCounter + 1)

  urlUpdateObj.contentType = batchItem.urlObj.contentType
  urlUpdateObj.charset = batchItem.urlObj.charset
  urlUpdateObj.errorMask = batchItem.urlObj.errorMask
  urlUpdateObj.crawlingTime = batchItem.urlObj.crawlingTime
  urlUpdateObj.totalTime = batchItem.urlObj.crawlingTime
  urlUpdateObj.httpCode = batchItem.urlObj.httpCode

  urlUpdateObj.status = batchItem.urlObj.status = status
  urlUpdateObj.size = batchItem.urlObj.size = contentSize
  urlUpdateObj.mRate = batchItem.urlObj.mRate = mrate

  batchItem.urlObj.UDate = batchItem.urlObj.tcDate = str(datetime.datetime.now())
  urlUpdateObj.UDate = urlUpdateObj.tcDate = SQLExpression("NOW()")
  batchItem.urlObj.mRateCounter += 1
  urlUpdateObj.mRateCounter = SQLExpression("`MRateCounter` + 1")
  urlUpdateObj.lastModified = batchItem.urlObj.lastModified = crawledResource.last_modified
  urlUpdateObj.urlMd5 = batchItem.urlObj.urlMd5

  if APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER in self.siteProperties:

                       batchItem.urlObj, logger,
                       APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER)

    if changedFieldsDict is not None:
      for name, value in changedFieldsDict.items():
        if hasattr(urlUpdateObj, name):
          setattr(urlUpdateObj, name, value)

  logger.debug("!!! Before self.dbWrapper.urlUpdate(urlUpdateObj, \"`Status` = 3\")")

  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj, "`Status` = 3")

  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
def updateURL(self, batchItem, batchId, status=dc.EventObjects.URL.STATUS_CRAWLING):

  urlUpdateObj.batchId = batchId
  if not self.urlObj.httpMethod:
    urlUpdateObj.httpMethod = batchItem.urlObj.httpMethod = "get"
  else:
    urlUpdateObj.httpMethod = batchItem.urlObj.httpMethod = self.urlObj.httpMethod

  urlUpdateObj.status = batchItem.urlObj.status = status
  batchItem.urlObj.crawled += 1

  urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")
  urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")

  if status == dc.EventObjects.URL.STATUS_CRAWLING:

    urlUpdateObj.contentType = batchItem.urlObj.contentType = dc.EventObjects.URL.CONTENT_TYPE_UNDEFINED

    urlUpdateObj.httpCode = batchItem.urlObj.httpCode = 0

  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj)
  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
if status is not None and self.dbWrapper is not None:

  urlUpdateObj.status = status
  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj)
  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
urlUpdateObj.errorMask = batchItem.urlObj.errorMask = 0
urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")
urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")
if result is not None:
  maxURLs = result.maxURLs
  if ((crawlerType != dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER) and \
      (result.state != dc.EventObjects.Site.STATE_ACTIVE)) or \
     ((crawlerType == dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER) and \
      (result.state == dc.EventObjects.Site.STATE_DISABLED)):
    logger.debug("Warning: Batch CrawlerType: %s, site state is %s but not STATE_ACTIVE!", crawlerType,

  if (result.maxErrors > 0) and (result.errors > result.maxErrors):
    msg = "Site maxErrors limit " + str(result.maxErrors) + " reached " + str(result.errors)

  if DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE in autoRemoveProps:
    where = autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE]
  else:
    where = "NOT (`Status`=4 AND `Crawled`=0 AND `Processed`=0)"
  query = "SELECT COUNT(*) FROM `%s` " % self.urlTable
  query += "WHERE " + where
  result = self.dbWrapper.customRequest(query, CrawlerTask.DB_URLS)
  if len(result) > 0 and len(result[0]) > 0:
    activeURLs = result[0][0]
    logger.debug("Active URLs count: " + str(activeURLs) + ", maxURLs: " + str(maxURLs))
    if (maxURLs > 0) and (activeURLs >= maxURLs):
      autoRemoved = URLProcess.autoRemoveURL(autoRemoveProps, recrawlPeriod, self.urlTable, self.dbWrapper)

      msg = "Active URLs:" + str(activeURLs) + " > MaxURLs:" + str(maxURLs) + " and no one auto-removed!"

      logger.debug(str(autoRemoved) + " URLs auto-removed to insert new URL from batch")
  else:
    msg = "Error of query processing, no rows returned:\n" + query

  batchItem.urlObj.CDate = str(datetime.datetime.now())
  batchItem.urlObj.UDate = batchItem.urlObj.CDate
  batchItem.urlObj.tcDate = batchItem.urlObj.CDate
  batchItem.urlObj.batchId = 0
  result = self.dbWrapper.urlNew([batchItem.urlObj])
  logger.debug("rows_count: %s", result)

except Exception as err:
  logger.debug('Error adding a new url from batch (another host source): ' + str(err))
if key in inDict and inDict[key] != '':
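
# checkDictEmptyStrings (see the member list below) validates that every
# required key is present with a non-empty value. A hedged completion of the
# one-line fragment above into a standalone helper:
def checkDictEmptyStrings(inDict, keys):
  for key in keys:
    if not (key in inDict and inDict[key] != ''):
      return False  # a required property is missing or blank
  return True

assert checkDictEmptyStrings({"a": "x", "b": "y"}, ["a", "b"])
assert not checkDictEmptyStrings({"a": "x", "b": ""}, ["a", "b"])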
if wrapper is not None:
  try:
    if URLProcess.checkDictEmptyStrings(autoRemoveProps, [DC_CONSTS.SITE_PROP_AUTO_REMOVE_RESOURCES,
                                                          DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE,
                                                          DC_CONSTS.SITE_PROP_AUTO_REMOVE_ORDER]):

      query = "SELECT Site_Id, URLMd5 FROM %s WHERE %s ORDER BY %s LIMIT %s" % \
              (urlTable,
               autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE].replace("%RecrawlPeriod%", str(recrawlPeriod)),
               autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_ORDER],
               autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_RESOURCES])
      logger.debug("SQL to select auto remove candidates: %s", query)
      result = wrapper.customRequest(query, CrawlerTask.DB_URLS)

                 reason=dc.EventObjects.URLDelete.REASON_CRAWLER_AUTOREMOVE)
        urlsToDelete.append(urlDelete)
        logger.debug("URL added to auto remove URLMd5:[%s]", row[1])
      drceSyncTasksCoverObj = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_DELETE, urlsToDelete)
      responseDRCESyncTasksCover = wrapper.process(drceSyncTasksCoverObj)
      logger.debug("Response from db-task module on URLDelete operation:\n%s", \
                   Utils.varDump(responseDRCESyncTasksCover))

      if isinstance(responseDRCESyncTasksCover, DC_CONSTS.DRCESyncTasksCover):
        generalResponse = responseDRCESyncTasksCover.eventObject
        if isinstance(generalResponse, GeneralResponse):
          deleted = sum([el for el in generalResponse.statuses if el])

      logger.debug("No auto remove candidates or SQL query error!")
    else:
      logger.debug("No mandatory auto remove properties in auto_remove_props:\n" + Utils.varDump(autoRemoveProps))
  except Exception as err:
    ExceptionLog.handler(logger, err, 'Error of auto remove operation:')
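
# What the auto-remove query looks like once the site properties are
# substituted in. The property key strings and values below are illustrative
# assumptions; the real keys are the DC_CONSTS.SITE_PROP_AUTO_REMOVE_*
# constants used above:
autoRemoveProps = {
  "AUTO_REMOVE_WHERE": "`Status`=4 AND `TcDate` < DATE_SUB(NOW(), INTERVAL %RecrawlPeriod% MINUTE)",
  "AUTO_REMOVE_ORDER": "`TcDate` ASC",
  "AUTO_REMOVE_RESOURCES": "10",
}
recrawlPeriod = 1440
query = "SELECT Site_Id, URLMd5 FROM %s WHERE %s ORDER BY %s LIMIT %s" % \
        ("urls_1234",
         autoRemoveProps["AUTO_REMOVE_WHERE"].replace("%RecrawlPeriod%", str(recrawlPeriod)),
         autoRemoveProps["AUTO_REMOVE_ORDER"],
         autoRemoveProps["AUTO_REMOVE_RESOURCES"])
# -> SELECT Site_Id, URLMd5 FROM urls_1234
#    WHERE `Status`=4 AND `TcDate` < DATE_SUB(NOW(), INTERVAL 1440 MINUTE)
#    ORDER BY `TcDate` ASC LIMIT 10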
if crawledTime is not None:
  collectTime = int((time.time() - crawledTime) * 1000)

if strContent is not None:
  urlUpdateObj.rawContentMd5 = hashlib.md5(strContent).hexdigest()
urlUpdateObj.crawlingTime = SQLExpression(("`CrawlingTime` + %s" % str(collectTime)))
urlUpdateObj.totalTime = SQLExpression(("`TotalTime` + %s" % str(collectTime)))

logger.debug(">>> detectMime = " + str(detectedMime))
if httpHeaders is not None:
  for header in httpHeaders:
    if header.lower() == "etag":

      urlUpdateObj.eTag = httpHeaders[header].split(',')[0].strip("\"'")
if detectedMime is not None and autoDetectMime is not None:
  urlUpdateObj.contentType = str(detectedMime)
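
# ETag normalization as done above: take the first comma-separated token and
# strip surrounding quotes, so '"abc123", "def456"' is stored as abc123. The
# header value is an example only:
httpHeaders = {"ETag": '"abc123", "def456"'}
for header in httpHeaders:
  if header.lower() == "etag":
    eTag = httpHeaders[header].split(',')[0].strip("\"'")
assert eTag == "abc123"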
def urlDBSync(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps):

  sqlQuery = "SELECT COUNT(*) FROM `%s` WHERE `URLMd5` = '%s'" % \
             (DB_CONSTS.DC_URLS_TABLE_NAME_TEMPLATE % self.siteId, batchItem.urlId)
  logger.debug("!!! urlDBSync sqlQuery: " + str(sqlQuery))

  result = self.dbWrapper.customRequest(sqlQuery, CrawlerTask.DB_URLS)
  logger.debug("!!! urlDBSync result: " + varDump(result))

  if result is not None and len(result) > 0 and len(result[0]) > 0:
    logger.debug("!!! urlDBSync result[0][0]: " + str(result[0][0]) + " type: " + str(type(result[0][0])))
    isExist = bool(int(result[0][0]) > 0)

    logger.debug("Url already exists in DB.")

    self.urlDBSync(batchItem, crawlerType, recrawlPeriod, autoRemoveProps)

    msg = "Can't add url from batch."

  except SyncronizeException as err:
    logger.debug("Can't synchronize url with db: " + str(err))
urlUpdateObj.size = size
urlUpdateObj.linksI = internalLinksCount
urlUpdateObj.linksE = externalLinksCount
urlUpdateObj.freq = freq
urlUpdateObj.rawContentMd5 = contentMd5
ret.type = self.urlObj.type

ret.requestDelay = self.urlObj.requestDelay
ret.httpTimeout = self.urlObj.httpTimeout
ret.httpMethod = formMethods.get(self.url, "get")
ret.parentMd5 = parentMd5
ret.maxURLsFromPage = maxURLsFromPage

ret.depth = (depth + 1)
ret.contentType = detectedMime
ret.priority = self.urlObj.priority

URLProcess.additionalUrlObjInit(ret, self.siteProperties["URLS_FIELDS_INIT"],

if re.search(pattern, self.url) is not None:

  ret.type = dc.EventObjects.URL.TYPE_CHAIN
updateUrlObjects = []
for urlObject in urlObjects:

  localUrlObject.urlMd5 = urlObject.urlMd5
  localUrlObject.type = typeArg
  updateUrlObjects.append(localUrlObject)
if len(updateUrlObjects) > 0:
  self.dbWrapper.urlUpdate(updateUrlObjects)
status = dc.EventObjects.URL.STATUS_CRAWLED

localType = dc.EventObjects.URL.TYPE_FETCHED
if processorName == PCONSTS.PROCESSOR_RSS:
  status = dc.EventObjects.URL.STATUS_NEW

  localType = dc.EventObjects.URL.TYPE_SINGLE

ret = self.fillRssFieldOneElem(feed.feed, objectUrlUlr, batchItem, status, crawled, localType)

for entry in feed.entries:
  if hasattr(entry, 'link'):
    logger.debug("entry.link=%s, oldUrl=%s", Utils.varDump(entry.link), Utils.varDump(oldUrl))
    if entry.link == oldUrl and ret is None:

      logger.debug("Getting next candidate URL")
    elif ret is not None and "urlObj" in ret and ret["urlObj"] is None:

      ret["parent_rss_feed"] = urlObj
      ret["parent_rss_feed_urlMd5"] = batchItem.urlId

      for date in CONSTS.pubdateFeedNames:
        try:
          dt = DateTimeType.parse(entry[date], True, logger, False)

          logger.debug("Convert pubdate from: '%s' to '%s'", str(entry[date]), dt.isoformat(' '))
          pubdate = DateTimeType.toUTC(dt).strftime("%Y-%m-%d %H:%M:%S")
          logger.debug("pubdate converted to UTC: '%s'", str(pubdate))

          logger.debug("Unsupported date format: '%s'", str(entry[date]))
        except Exception as err:
          logger.debug("Error: %s, data: '%s'", str(err), str(entry[date]))

      logger.debug("!!! Before apply 'SQLExpression' and 'STAGE_COLLECT_URLS' pubdate: " + str(pubdate))
      localFilters = Filters(None, self.dbWrapper, batchItem.siteId, 0,
                             None, Filters.OC_SQLE, Filters.STAGE_COLLECT_URLS)
      isExistFilter = localFilters.isExist(Filters.STAGE_COLLECT_URLS, Filters.OC_SQLE)
      logger.debug("Filter exists: " + str(bool(isExistFilter)))
      if isExistFilter and pubdate is not None:

        if collectURLs.filtersApply(None, '', batchItem.depth, self.dbWrapper, batchItem.siteId,
                                    {'PDATE': str(pubdate)}, Filters.OC_SQLE, Filters.STAGE_COLLECT_URLS,
                                    None, False):
          logger.debug("Candidate URL matched SQLExpression filter.")

          logger.debug("Candidate URL did not match SQLExpression filter, skipped.")

      if len(entry.links) > 0 and hasattr(entry.links[0], 'type'):
        contentType = entry.links[0].type
      ret["urlObj"].status = status
      ret["urlObj"].crawled = crawled
      ret["urlObj"].contentType = contentType
      ret["urlObj"].pDate = pubdate
      ret["urlObj"].type = localType

      ret["urlObj"].size = size
      ret["pubdate"] = pubdate

      URLProcess.additionalUrlObjInit(ret, self.siteProperties["URLS_FIELDS_INIT"],
def urlTemplateApply(self, url, crawlerType, urlTempalteRegular, urlTempalteRealtime, urlTempalteRegularEncode,
                     urlTempalteRealtimeEncode):

  if crawlerType == dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER:
    if urlTempalteRealtime is not None:

      if urlTempalteRealtimeEncode is not None and bool(int(urlTempalteRealtimeEncode)):
        encodedUrl = urllib.quote(url)

  else:
    if urlTempalteRegular is not None:

      if urlTempalteRegularEncode is not None and bool(int(urlTempalteRegularEncode)):
        encodedUrl = urllib.quote(url)

  logger.debug(">>> url was replaced")
  logger.debug(">>> new url = " + ret)
conditionElements = condition.split(' ', 2)
if len(conditionElements) == 3:
  objectName = conditionElements[0]
  operationName = conditionElements[1]
  value = conditionElements[2]
  if len(value) > 0 and (value[0] == '"' or value[0] == '\''):
    value = value[1:]
  if len(value) > 0 and (value[-1] == '"' or value[-1] == '\''):
    value = value[:-1]
  objectName = objectName.strip().split('.')
  if len(objectName) >= 2:
    fieldName = objectName[1]
    objectName = objectName[0]
    if objectName in conditionalData and hasattr(conditionalData[objectName], fieldName):
      if operationName == '=' or operationName == "==":
        if str(getattr(conditionalData[objectName], fieldName)) == value:
          ret = True
      elif operationName == "match":
        if re.compile(value).match(str(getattr(conditionalData[objectName], fieldName))) is not None:
          ret = True
      elif operationName == "search":
        if re.compile(value).search(str(getattr(conditionalData[objectName], fieldName))) is not None:
          ret = True
      elif operationName == "<>" or operationName == "!=":
        if str(getattr(conditionalData[objectName], fieldName)) != value:
          ret = True
      elif operationName == "is" and value == 'empty':
        if str(getattr(conditionalData[objectName], fieldName)) == '':
          ret = True
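
# The condition grammar implied above is "<object>.<field> <op> <value>",
# e.g. 'urlObj.contentType == "text/html"' or 'urlObj.pDate is empty'. A
# minimal end-to-end illustration; UrlStub is a hypothetical stand-in object:
class UrlStub(object):
  contentType = "text/html"
  pDate = ""

conditionalData = {"urlObj": UrlStub()}
condition = 'urlObj.contentType == "text/html"'

objectName, operationName, value = condition.split(' ', 2)
value = value.strip('"\'')
objectName, fieldName = objectName.strip().split('.')
matched = (operationName in ('=', '==') and
           str(getattr(conditionalData[objectName], fieldName)) == value)
assert matched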
try:
  urlInit = json.loads(urlInitParam)
  for fieldName in urlInit:
    if hasattr(urlObj, fieldName):
      for condition in urlInit[fieldName]["conditions"]:
        if (isinstance(condition, types.BooleanType) and condition) or \
           (isinstance(condition, types.StringTypes) and URLProcess.conditionEvaluate(condition, conditionalData)):
          setattr(urlObj, fieldName, urlInit[fieldName]["value"])

except Exception as excp:
  logger.debug(">>> some error in URLS_FIELDS_INIT param processing; err=" + str(excp))
string DC_URLS_TABLE_PREFIX
def recrawlUrlUpdateHandler(self, dbWrapper, recrawlUrlUpdateProperty, urlUpdateObj)
def detectUrlMime(self, contentTypeMap=None, urlObj=None)
def additionalUrlObjInit(urlObj, urlInitParam, conditionalData)
def updateURLStatus(self, urlId, status=dc.EventObjects.URL.STATUS_CRAWLED)
def updateURL(self, batchItem, batchId, status=dc.EventObjects.URL.STATUS_CRAWLING)
def checkDictEmptyStrings(inDict, keys)
def fillRssFieldInUrlObj(self, oldUrl, objectUrlUlr, batchItem, processorName, feed, rootFeed=False)
def processURL(self, realUrl, internalLinks, externalLinks, filtersApply=None, siteFilters=None, baseUrl=None)
def createUrlObjForChain(self, pattern, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage)
def updateCollectTimeAndMime(self, detectedMime, batchItem, crawledTime, autoDetectMime, httpHeaders=None, strContent=None)
def autoRemoveURL(autoRemoveProps, recrawlPeriod, urlTable, wrapper)
def isUrlExist(self, recrawlPeriod, urlMd5)
def urlDBSync(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps)
def checkUrlByProtocol(self, url)
def resolveHTTP(self, postForms, headersDict)
def __init__(self, protocols=None)
def getDepthFromUrl(self, urlMd5)
def updateURLFields(self, urlMd5, wrapper, siteId)
def checkFieldsIsNone(self, checkList)
def fillRssFieldOneElem(self, entry, urlObj, batchItem, status, crawled, localType)
def setProtocols(self, protocols=None)
def resolveTableName(self, localSiteId)
def updateURLForFailed(self, errorBit, batchItem, httpCode=CONSTS.HTTP_CODE_400, status=dc.EventObjects.URL.STATUS_CRAWLED, updateUdate=True)
def simpleURLCanonize(self, realUrl)
def urlTemplateApply(self, url, crawlerType, urlTempalteRegular, urlTempalteRealtime, urlTempalteRegularEncode, urlTempalteRealtimeEncode)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def conditionEvaluate(condition, conditionalData)
def readCurrentCnt(self, maxURLs)
def addURLFromBatchToDB(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps)
def createUrlObjForCollectURLs(self, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage)
def resetErrorMask(self, batchItem)
string URL_TEMPLATE_CONST
def updateAdditionProps(self, internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5)
def updateCrawledURL(self, crawledResource, batchItem, contentSize, status=dc.EventObjects.URL.STATUS_CRAWLED)
def checkUrlByPath(self, url)
def updateTypeForURLObjects(self, urlObjects, typeArg=dc.EventObjects.URL.TYPE_CHAIN)