HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings, version 2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.URLProcess.URLProcess Class Reference

Public Member Functions

def __init__ (self, protocols=None)
 
def setProtocols (self, protocols=None)
 
def checkUrlByPath (self, url)
 
def checkUrlByProtocol (self, url)
 
def checkFieldsIsNone (self, checkList)
 
def resolveTableName (self, localSiteId)
 
def readCurrentCnt (self, maxURLs)
 
def simpleURLCanonize (self, realUrl)
 
def processURL (self, realUrl, internalLinks, externalLinks, filtersApply=None, siteFilters=None, baseUrl=None)
 
def isUrlExist (self, recrawlPeriod, urlMd5)
 
def updateURLFields (self, urlMd5, wrapper, siteId)
 
def recrawlUrlUpdateHandler (self, dbWrapper, recrawlUrlUpdateProperty, urlUpdateObj)
 
def detectUrlMime (self, contentTypeMap=None, urlObj=None)
 
def getDepthFromUrl (self, urlMd5)
 
def updateURLForFailed (self, errorBit, batchItem, httpCode=CONSTS.HTTP_CODE_400, status=dc.EventObjects.URL.STATUS_CRAWLED, updateUdate=True)
 
def getRealUrl (self)
 
def resolveHTTP (self, postForms, headersDict)
 
def updateCrawledURL (self, crawledResource, batchItem, contentSize, status=dc.EventObjects.URL.STATUS_CRAWLED)
 
def updateURL (self, batchItem, batchId, status=dc.EventObjects.URL.STATUS_CRAWLING)
 
def updateURLStatus (self, urlId, status=dc.EventObjects.URL.STATUS_CRAWLED)
 
def resetErrorMask (self, batchItem)
 
def addURLFromBatchToDB (self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps)
 
def updateCollectTimeAndMime (self, detectedMime, batchItem, crawledTime, autoDetectMime, httpHeaders=None, strContent=None)
 
def urlDBSync (self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps)
 
def updateAdditionProps (self, internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5)
 
def createUrlObjForCollectURLs (self, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage)
 
def createUrlObjForChain (self, pattern, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage)
 
def updateTypeForURLObjects (self, urlObjects, typeArg=dc.EventObjects.URL.TYPE_CHAIN)
 
def fillRssFieldInUrlObj (self, oldUrl, objectUrlUlr, batchItem, processorName, feed, rootFeed=False)
 
def fillRssFieldOneElem (self, entry, urlObj, batchItem, status, crawled, localType)
 
def urlTemplateApply (self, url, crawlerType, urlTempalteRegular, urlTempalteRealtime, urlTempalteRegularEncode, urlTempalteRealtimeEncode)
 

Static Public Member Functions

def checkDictEmptyStrings (inDict, keys)
 
def autoRemoveURL (autoRemoveProps, recrawlPeriod, urlTable, wrapper)
 
def conditionEvaluate (condition, conditionalData)
 
def additionalUrlObjInit (urlObj, urlInitParam, conditionalData)
 

Public Attributes

 isUpdateCollection
 
 urlObj
 
 url
 
 dbWrapper
 
 siteId
 
 site
 
 urlTable
 
 protocolsList
 
 siteProperties
 
 normMask
 

Static Public Attributes

string DC_URLS_TABLE_PREFIX = "urls_"
 
int DETECT_MIME_TIMEOUT = 1
 
 PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
 
string URL_TEMPLATE_CONST = "%URL%"
 
string PROTOCOL_PREFIX = "://"
 
list DEFAULT_PROTOCOLS = ["http", "https"]
 

Detailed Description

Definition at line 48 of file URLProcess.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.URLProcess.URLProcess.__init__ (   self,
  protocols = None 
)

Definition at line 58 of file URLProcess.py.

58  def __init__(self, protocols=None):
59  self.isUpdateCollection = False
60  self.urlObj = None
61  self.url = None
62  self.dbWrapper = None
63  self.siteId = None
64  self.site = None
65  self.urlTable = None
66  self.protocolsList = self.DEFAULT_PROTOCOLS
67  self.siteProperties = None
68  self.setProtocols(protocols)
69  self.normMask = UrlNormalizator.NORM_DEFAULT
70 
71 
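A minimal usage sketch (assuming the module is importable as shown in this documentation): the optional protocols argument is a JSON-encoded list that setProtocols() parses into protocolsList, falling back to DEFAULT_PROTOCOLS.

from dc_crawler.URLProcess import URLProcess

urlProcess = URLProcess(protocols='["http", "https", "ftp"]')
print(urlProcess.protocolsList)  # [u'http', u'https', u'ftp']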

Member Function Documentation

◆ additionalUrlObjInit()

def dc_crawler.URLProcess.URLProcess.additionalUrlObjInit (   urlObj,
  urlInitParam,
  conditionalData 
)
static

Definition at line 1061 of file URLProcess.py.

1061  def additionalUrlObjInit(urlObj, urlInitParam, conditionalData):
1062  try:
1063  urlInit = json.loads(urlInitParam)
1064  for fieldName in urlInit:
1065  if hasattr(urlObj, fieldName):
1066  for condition in urlInit[fieldName]["conditions"]:
1067  if (isinstance(condition, types.BooleanType) and condition) or \
1068  (isinstance(condition, types.StringTypes) and URLProcess.conditionEvaluate(condition, conditionalData)):
1069  setattr(urlObj, fieldName, urlInit[fieldName]["value"])
1070  break
1071  except Exception as excp:
1072  logger.debug(">>> some error with URLS_FIELDS_INIT param processing; err=" + str(excp))
1073 
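A hedged illustration of the URLS_FIELDS_INIT JSON this helper consumes: each key names an attribute of urlObj and carries a list of conditions; the first condition that holds (a JSON true, or a string accepted by conditionEvaluate()) assigns the given value. The stub class below is illustrative only, standing in for dc.EventObjects.URL.

class _Url(object):  # hypothetical stand-in for dc.EventObjects.URL
    priority = 0

target = _Url()
urlInitParam = '{"priority": {"conditions": [true], "value": 5}}'
URLProcess.additionalUrlObjInit(target, urlInitParam, {"parent": None})
print(target.priority)  # 5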

◆ addURLFromBatchToDB()

def dc_crawler.URLProcess.URLProcess.addURLFromBatchToDB (   self,
  batchItem,
  crawlerType,
  recrawlPeriod,
  autoRemoveProps 
)

Definition at line 567 of file URLProcess.py.

567  def addURLFromBatchToDB(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps):
568  # variable for result
569  ret = True
570 
571  if self.dbWrapper is not None:
572  self.checkFieldsIsNone(["dbWrapper", "siteId", "urlTable"])
573  try:
574  siteStatusObj = dc.EventObjects.SiteStatus(Utils.autoFillSiteId(self.siteId, logger))
575  result = self.dbWrapper.siteStatus(siteStatusObj)
576  if result is not None:
577  maxURLs = result.maxURLs
578  if ((crawlerType != dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER) and \
579  (result.state != dc.EventObjects.Site.STATE_ACTIVE)) or \
580  ((crawlerType == dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER) and \
581  (result.state == dc.EventObjects.Site.STATE_DISABLED)):
582  logger.debug("Warning: Batch CrawlerType: %s, site state is %s but not STATE_ACTIVE!", crawlerType,
583  str(result.state))
584  raise SyncronizeException("Site state is not active, state=" + str(result.state))
585 
586  if (result.maxErrors > 0) and (result.errors > result.maxErrors):
587  msg = "Site maxErrors limit " + str(result.maxErrors) + " reached " + str(result.errors)
588  logger.debug(msg)
589  raise SyncronizeException(msg)
590 
591  # Check the maxURLs limit against the count of active URLs (not migrated with batches)
592  if DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE in autoRemoveProps:
593  where = autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE_ACTIVE]
594  else:
595  where = "NOT (`Status`=4 AND `Crawled`=0 AND `Processed`=0)"
596  query = "SELECT COUNT(*) FROM `%s` " % self.urlTable
597  query += "WHERE " + where
598  result = self.dbWrapper.customRequest(query, CrawlerTask.DB_URLS)
599  if len(result) > 0 and len(result[0]) > 0:
600  activeURLs = result[0][0]
601  logger.debug("Active URLs count: " + str(activeURLs) + ", maxURLs: " + str(maxURLs))
602  if (maxURLs > 0) and (activeURLs >= maxURLs):
603  autoRemoved = URLProcess.autoRemoveURL(autoRemoveProps, recrawlPeriod, self.urlTable, self.dbWrapper)
604  if autoRemoved < 1:
605  msg = "Active URLs:" + str(activeURLs) + " > MaxURLs:" + str(maxURLs) + " and none auto-removed!"
606  logger.debug(msg)
607  raise SyncronizeException(msg)
608  else:
609  logger.debug(str(autoRemoved) + " URLs auto-removed to insert new URL from batch")
610  else:
611  msg = "Error of query processing, no rows returned:\n" + query
612  logger.debug(msg)
613  raise SyncronizeException(msg)
614 
615  batchItem.urlObj.CDate = str(datetime.datetime.now())
616  batchItem.urlObj.UDate = batchItem.urlObj.CDate
617  batchItem.urlObj.tcDate = batchItem.urlObj.CDate
618  batchItem.urlObj.batchId = 0 # self.batch.id
619  result = self.dbWrapper.urlNew([batchItem.urlObj])
620  logger.debug("rows_count: %s", result)
621  self.isUpdateCollection = True
622  # self.updateCollectedURLs()
623  else:
624  raise SyncronizeException("Last SQL query (SiteStatus) returned no rows!")
625  except Exception as err:
626  logger.debug('Error adding new url from batch (another host source): ' + str(err))
627  ret = False
628  raise err
629 
630  return ret
631 
632 

◆ autoRemoveURL()

def dc_crawler.URLProcess.URLProcess.autoRemoveURL (   autoRemoveProps,
  recrawlPeriod,
  urlTable,
  wrapper 
)
static

Definition at line 660 of file URLProcess.py.

660  def autoRemoveURL(autoRemoveProps, recrawlPeriod, urlTable, wrapper):
661  ret = 0
662  if wrapper is not None:
663  try:
664  # logger.debug("Auto remove properties:\n%s", varDump(autoRemoveProps))
665  # If auto-remove properties are defined and set to non-empty values
666  if URLProcess.checkDictEmptyStrings(autoRemoveProps, [DC_CONSTS.SITE_PROP_AUTO_REMOVE_RESOURCES,
667  DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE,
668  DC_CONSTS.SITE_PROP_AUTO_REMOVE_ORDER]):
669  # Select candidates to remove
670  query = "SELECT Site_Id, URLMd5 FROM %s WHERE %s ORDER BY %s LIMIT %s" % \
671  (urlTable,
672  autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_WHERE].replace("%RecrawlPeriod%", str(recrawlPeriod)),
673  autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_ORDER],
674  autoRemoveProps[DC_CONSTS.SITE_PROP_AUTO_REMOVE_RESOURCES])
675  logger.debug("SQL to select auto remove candidates: %s", query)
676  result = wrapper.customRequest(query, CrawlerTask.DB_URLS)
677  if len(result) > 0:
678  urlsToDelete = []
679  for row in result:
680  # Create new URLDelete object
681  urlDelete = dc.EventObjects.URLDelete(row[0], row[1], dc.EventObjects.URLStatus.URL_TYPE_MD5,
682  reason=dc.EventObjects.URLDelete.REASON_CRAWLER_AUTOREMOVE)
683  urlsToDelete.append(urlDelete)
684  logger.debug("URL added to auto remove URLMd5:[%s]", row[1])
685  drceSyncTasksCoverObj = DC_CONSTS.DRCESyncTasksCover(DC_CONSTS.EVENT_TYPES.URL_DELETE, urlsToDelete)
686  responseDRCESyncTasksCover = wrapper.process(drceSyncTasksCoverObj)
687  logger.debug("Response from db-task module on URLDelete operation:\n%s", \
688  Utils.varDump(responseDRCESyncTasksCover))
689  deleted = 0
690  if isinstance(responseDRCESyncTasksCover, DC_CONSTS.DRCESyncTasksCover):
691  generalResponse = responseDRCESyncTasksCover.eventObject
692  if isinstance(generalResponse, GeneralResponse):
693  deleted = sum([el for el in generalResponse.statuses if el])
694  ret = deleted
695  else:
696  logger.debug("No auto remove candidates or SQL query error!")
697  else:
698  logger.debug("No mandatory auto remove properties in auto_remove_props:\n" + Utils.varDump(autoRemoveProps))
699  except Exception as err:
700  ExceptionLog.handler(logger, err, 'Error of auto remove operation:')
701 
702  return ret
703 
704 

◆ checkDictEmptyStrings()

def dc_crawler.URLProcess.URLProcess.checkDictEmptyStrings (   inDict,
  keys 
)
static

Definition at line 639 of file URLProcess.py.

639  def checkDictEmptyStrings(inDict, keys):
640  ret = False
641  for key in keys:
642  if key in inDict and inDict[key] != '':
643  ret = True
644  else:
645  ret = False
646  break
647  return ret
648 
649 
650 
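In short, every listed key must be present in inDict with a non-empty value for a True result; a sketch:

props = {"where": "Status=4", "order": ""}
URLProcess.checkDictEmptyStrings(props, ["where"])           # True
URLProcess.checkDictEmptyStrings(props, ["where", "order"])  # False: "order" is empty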

◆ checkFieldsIsNone()

def dc_crawler.URLProcess.URLProcess.checkFieldsIsNone (   self,
  checkList 
)

Definition at line 108 of file URLProcess.py.

108  def checkFieldsIsNone(self, checkList):
109  # for field in self.__dict__:
110  # if field in checkList and (not hasattr(self, field) or getattr(self, field) is None):
111  # raise Exception(">>> [CollectURLs] Mandatory field must be initialized, field Name = " + field)
112  for name in checkList:
113  if not hasattr(self, name) or getattr(self, name) is None:
114  raise Exception("Some mandatory field `%s` must be initialized!" % name)
115 
116 

◆ checkUrlByPath()

def dc_crawler.URLProcess.URLProcess.checkUrlByPath (   self,
  url 
)

Definition at line 85 of file URLProcess.py.

85  def checkUrlByPath(self, url):
86  ret = False
87  position = url.find(self.PROTOCOL_PREFIX)
88  if position > 0 and url.find('/') == (position + 1):
89  ret = True
90  return ret
91 
92 
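The check passes only when "://" appears after at least one leading character and the first "/" in the string is the one that opens the "://" sequence; a sketch:

urlProcess = URLProcess()
urlProcess.checkUrlByPath("http://example.com/")      # True
urlProcess.checkUrlByPath("mailto:user@example.com")  # False: no "://"
urlProcess.checkUrlByPath("/relative/page.html")      # False: no protocol prefix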

◆ checkUrlByProtocol()

def dc_crawler.URLProcess.URLProcess.checkUrlByProtocol (   self,
  url 
)

Definition at line 95 of file URLProcess.py.

95  def checkUrlByProtocol(self, url):
96  ret = False
97  for elem in self.protocolsList:
98  if url.lower().startswith(elem + self.PROTOCOL_PREFIX):
99  ret = True
100  break
101  if not ret:
102  logger.debug(">>> URL skipped by protocol = " + url)
103  return ret
104 
105 
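Only URLs whose scheme matches an entry of protocolsList (compared lowercased) are accepted; everything else is logged and rejected. A sketch with the default protocol list:

urlProcess = URLProcess()                              # DEFAULT_PROTOCOLS: http, https
urlProcess.checkUrlByProtocol("HTTPS://example.com/")  # True
urlProcess.checkUrlByProtocol("ftp://example.com/")    # False, logged as skipped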

◆ conditionEvaluate()

def dc_crawler.URLProcess.URLProcess.conditionEvaluate (   condition,
  conditionalData 
)
static

Definition at line 1020 of file URLProcess.py.

1020  def conditionEvaluate(condition, conditionalData):
1021  ret = False
1022  conditionElements = condition.split(' ', 2)
1023  if len(conditionElements) == 3:
1024  objectName = conditionElements[0]
1025  operationName = conditionElements[1]
1026  value = conditionElements[2]
1027  if len(value) > 0 and (value[0] == '"' or value[0] == '\''):
1028  value = value[1:]
1029  if len(value) > 0 and (value[-1] == '"' or value[-1] == '\''):
1030  value = value[0:-1]
1031  objectName = objectName.strip().split('.')
1032  if len(objectName) >= 2:
1033  fieldName = objectName[1]
1034  objectName = objectName[0]
1035  if objectName in conditionalData and hasattr(conditionalData[objectName], fieldName):
1036  if operationName == '=' or operationName == "==":
1037  if str(getattr(conditionalData[objectName], fieldName)) == value:
1038  ret = True
1039  elif operationName == "match":
1040  if re.compile(value).match(str(getattr(conditionalData[objectName], fieldName))) is not None:
1041  ret = True
1042  elif operationName == "search":
1043  if re.compile(value).search(str(getattr(conditionalData[objectName], fieldName))) is not None:
1044  ret = True
1045  elif operationName == "<>" or operationName == "!=":
1046  if str(getattr(conditionalData[objectName], fieldName)) != value:
1047  ret = True
1048  elif operationName == "is" and value == 'empty':
1049  if str(getattr(conditionalData[objectName], fieldName)) == '':
1050  ret = True
1051 
1052  return ret
1053 
1054 
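The condition grammar is "<object>.<field> <operation> <value>", split on the first two spaces; supported operations are = / ==, <> / !=, match, search, and "is empty", with optional quotes stripped from the value. A runnable sketch against a stub object (the stub is illustrative, not part of the API):

class _Stub(object):
    status = "NEW"

data = {"parent": _Stub()}
URLProcess.conditionEvaluate('parent.status == "NEW"', data)  # True
URLProcess.conditionEvaluate('parent.status match ^N', data)  # True
URLProcess.conditionEvaluate('parent.status is empty', data)  # False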

◆ createUrlObjForChain()

def dc_crawler.URLProcess.URLProcess.createUrlObjForChain (   self,
  pattern,
  urlMd5,
  formMethods,
  parentMd5,
  depth,
  detectedMime,
  maxURLsFromPage 
)

Definition at line 841 of file URLProcess.py.

841  def createUrlObjForChain(self, pattern, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage):
842  ret = None
843  self.checkFieldsIsNone(["url"])
844  # logger.debug(">>> chain pattern is = " + str(pattern) + " url = " + self.url)
845  if re.search(pattern, self.url) is not None:
846  ret = self.createUrlObjForCollectURLs(urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage)
847  ret.type = dc.EventObjects.URL.TYPE_CHAIN
848  return ret
849 
850 

◆ createUrlObjForCollectURLs()

def dc_crawler.URLProcess.URLProcess.createUrlObjForCollectURLs (   self,
  urlMd5,
  formMethods,
  parentMd5,
  depth,
  detectedMime,
  maxURLsFromPage 
)

Definition at line 809 of file URLProcess.py.

809  def createUrlObjForCollectURLs(self, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage):
810  self.checkFieldsIsNone(["url", "siteId", "urlObj"])
811  ret = dc.EventObjects.URL(self.siteId, self.url, normalizeMask=self.normMask)
812  ret.type = self.urlObj.type
813  ret.urlMd5 = urlMd5
814  ret.requestDelay = self.urlObj.requestDelay
815  ret.httpTimeout = self.urlObj.httpTimeout
816  ret.httpMethod = formMethods.get(self.url, "get")
817  ret.parentMd5 = parentMd5
818  ret.maxURLsFromPage = maxURLsFromPage
819  ret.tcDate = SQLExpression("NOW()")
820  ret.UDate = SQLExpression("NOW()")
821  ret.depth = (depth + 1)
822  ret.contentType = detectedMime
823  ret.priority = self.urlObj.priority
824  # TODO Additional URL init
825  if self.siteProperties is not None and "URLS_FIELDS_INIT" in self.siteProperties:
826  URLProcess.additionalUrlObjInit(ret, self.siteProperties["URLS_FIELDS_INIT"],
827  {"site": self.site, "parent": self.urlObj})
828  return ret
829 
830 

◆ detectUrlMime()

def dc_crawler.URLProcess.URLProcess.detectUrlMime (   self,
  contentTypeMap = None,
  urlObj = None 
)

Definition at line 339 of file URLProcess.py.

339  def detectUrlMime(self, contentTypeMap=None, urlObj=None):
340  del urlObj
341  self.checkFieldsIsNone(["url"])
342  ret = ''
343  try:
344  res = requests.head(self.url, timeout=self.DETECT_MIME_TIMEOUT)
345  ret = res.headers.get('content-type', '').lower()
346  if contentTypeMap is not None and ret in contentTypeMap:
347  logger.debug(">>> Mime type replaced from %s to %s", ret, contentTypeMap[ret])
348  ret = contentTypeMap[ret]
349  except Exception:
350  logger.warn("detect mime type for %s failed", self.url, exc_info=True)
351  return ret
352 
353 
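Detection issues an HTTP HEAD request bounded by DETECT_MIME_TIMEOUT and lowercases the returned Content-Type; contentTypeMap, when given, remaps raw values. A sketch (network access assumed, mapping keys illustrative):

urlProcess = URLProcess()
urlProcess.url = "http://example.com/"
mime = urlProcess.detectUrlMime(contentTypeMap={"text/html; charset=utf-8": "text/html"})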

◆ fillRssFieldInUrlObj()

def dc_crawler.URLProcess.URLProcess.fillRssFieldInUrlObj (   self,
  oldUrl,
  objectUrlUlr,
  batchItem,
  processorName,
  feed,
  rootFeed = False 
)

Definition at line 877 of file URLProcess.py.

877  def fillRssFieldInUrlObj(self, oldUrl, objectUrlUlr, batchItem, processorName, feed, rootFeed=False):
878  # logger.debug("oldUrl=%s\nobjectUrlUlr=%s\nbatchItem=%s\nprocessorName=%s\nfeed=%s\n",
879  # Utils.varDump(oldUrl),
880  # Utils.varDump(objectUrlUlr),
881  # Utils.varDump(batchItem),
882  # Utils.varDump(processorName),
883  # str(feed))
884  self.checkFieldsIsNone(["url", "siteId", "urlObj"])
885  ret = None
886  status = dc.EventObjects.URL.STATUS_CRAWLED
887  crawled = 1
888  localType = dc.EventObjects.URL.TYPE_FETCHED
889  if processorName == PCONSTS.PROCESSOR_RSS:
890  status = dc.EventObjects.URL.STATUS_NEW
891  crawled = 0
892  localType = dc.EventObjects.URL.TYPE_SINGLE
893  if rootFeed:
894  ret = self.fillRssFieldOneElem(feed.feed, objectUrlUlr, batchItem, status, crawled, localType)
895  else:
896  for entry in feed.entries:
897  if hasattr(entry, 'link'):
898  logger.debug("entry.link=%s, oldUrl=%s", Utils.varDump(entry.link), Utils.varDump(oldUrl))
899  if entry.link == oldUrl and ret is None:
900  ret = self.fillRssFieldOneElem(entry, objectUrlUlr, batchItem, status, crawled, localType)
901  if ret is None:
902  logger.debug("Getting next candidate URL")
903  elif ret is not None and "urlObj" in ret and ret["urlObj"] is None:
904  ret = self.fillRssFieldOneElem(entry, objectUrlUlr, batchItem, status, crawled, localType)
905 
906  return ret
907 
908 

◆ fillRssFieldOneElem()

def dc_crawler.URLProcess.URLProcess.fillRssFieldOneElem (   self,
  entry,
  urlObj,
  batchItem,
  status,
  crawled,
  localType 
)

Definition at line 918 of file URLProcess.py.

918  def fillRssFieldOneElem(self, entry, urlObj, batchItem, status, crawled, localType):
919  # variable for result
920  ret = {}
921  ret["entry"] = entry
922  ret["urlObj"] = dc.EventObjects.URL(self.siteId, self.url, normalizeMask=self.normMask)
923  ret["parent_rss_feed"] = urlObj
924  ret["parent_rss_feed_urlMd5"] = batchItem.urlId
925  # Getting pubdate from feed
926  pubdate = None
927  for date in CONSTS.pubdateFeedNames:
928  if date in entry:
929  try:
930  dt = DateTimeType.parse(entry[date], True, logger, False)
931  if dt is not None:
932  logger.debug("Convert pubdate from: '%s' to '%s'", str(entry[date]), dt.isoformat(' '))
933  pubdate = DateTimeType.toUTC(dt).strftime("%Y-%m-%d %H:%M:%S")
934  logger.debug("pubdate converted to UTC: '%s'", str(pubdate))
935  break
936  except TypeError:
937  logger.debug("Unsupported date format: '%s'", str(entry[date]))
938  except Exception, err:
939  logger.debug("Error: %s, data: '%s'", str(err), str(entry[date]))
940 
941  logger.debug("!!! Before apply 'SQLExpression' and 'STAGE_COLLECT_URLS' pubdate: " + str(pubdate))
942  localFilters = Filters(None, self.dbWrapper, batchItem.siteId, 0, None, Filters.OC_SQLE, Filters.STAGE_COLLECT_URLS)
943  isExistFilter = localFilters.isExist(Filters.STAGE_COLLECT_URLS, Filters.OC_SQLE)
944  logger.debug("Filter exists: " + str(bool(isExistFilter)))
945  if isExistFilter and pubdate is not None:
946  collectURLs = CollectURLs()
947  if collectURLs.filtersApply(None, '', batchItem.depth, self.dbWrapper, batchItem.siteId,
948  {'PDATE':str(pubdate)}, Filters.OC_SQLE, Filters.STAGE_COLLECT_URLS, None, False):
949  logger.debug("Candidate URL matched SQLExpression filter.")
950  else:
951  logger.debug("Candidate URL not matched SQLExpression filter, skipped.")
952  # ret["urlObj"] = None
953  ret = None
954  return ret
955 
956  contentType = entry.links[0].type if (len(entry.links) > 0 and
957  hasattr(entry.links[0], 'type')) else ''
958  ret["urlObj"].status = status
959  ret["urlObj"].crawled = crawled
960  ret["urlObj"].contentType = contentType
961  ret["urlObj"].pDate = pubdate
962  ret["urlObj"].type = localType
963  size = len(str(ret))
964  ret["urlObj"].size = size
965  ret["pubdate"] = pubdate
966  # TODO Additional URL init
967  if self.siteProperties is not None and "URLS_FIELDS_INIT" in self.siteProperties:
968  URLProcess.additionalUrlObjInit(ret, self.siteProperties["URLS_FIELDS_INIT"],
969  {"site": self.site, "parent": self.urlObj})
970 
971  # logger.debug(">>>> ret[\"urlObj\"].pDate = " + str(ret["urlObj"].pDate))
972 
973  return ret
974 
975 

◆ getDepthFromUrl()

def dc_crawler.URLProcess.URLProcess.getDepthFromUrl (   self,
  urlMd5 
)

Definition at line 357 of file URLProcess.py.

357  def getDepthFromUrl(self, urlMd5):
358  ret = 0
359  if self.dbWrapper is not None:
360  self.checkFieldsIsNone(["dbWrapper", "siteId"])
361 
362  urlStatusObj = dc.EventObjects.URLStatus(self.siteId, urlMd5)
363  result = self.dbWrapper.urlStatus(urlStatusObj, True)
364  if len(result) > 0 and isinstance(result[0], dc.EventObjects.URL):
365  ret = result[0].depth
366 
367  return ret
368 
369 

◆ getRealUrl()

def dc_crawler.URLProcess.URLProcess.getRealUrl (   self)

Definition at line 411 of file URLProcess.py.

411  def getRealUrl(self):
412  self.checkFieldsIsNone(["url"])
413  if self.url.startswith("http%3A") or self.url.startswith("https%3A"):
414  ret = urllib.unquote(self.url).decode('utf-8')
415  else:
416  ret = self.url.decode('utf8')
417  return ret
418 
419 
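So a percent-encoded URL is unquoted before decoding, while anything else is decoded as UTF-8 directly:

urlProcess = URLProcess()
urlProcess.url = "http%3A//example.com/a%20b"
print(urlProcess.getRealUrl())  # http://example.com/a b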

◆ isUrlExist()

def dc_crawler.URLProcess.URLProcess.isUrlExist (   self,
  recrawlPeriod,
  urlMd5 
)

Definition at line 208 of file URLProcess.py.

208  def isUrlExist(self, recrawlPeriod, urlMd5):
209  # variable for result
210  ret = False
211 
212  if self.dbWrapper is not None:
213  self.checkFieldsIsNone(["url", "dbWrapper", "siteId", "urlTable"])
214 
215  if "RECRAWL_URL_AGE_EXPRESSION" in self.siteProperties and self.siteProperties["RECRAWL_URL_AGE_EXPRESSION"] != "":
216  ageExpr = self.siteProperties["RECRAWL_URL_AGE_EXPRESSION"].replace("%RECRAWL_PERIOD%", str(recrawlPeriod))
217  else:
218  ageExpr = "(DATE_ADD(UDate, INTERVAL %s MINUTE)-NOW())" % (str(recrawlPeriod))
219  query = "SELECT COUNT(*), %s, `Type` FROM `%s` WHERE `URLMd5` = '%s'" % (ageExpr, self.urlTable, urlMd5)
220  result = self.dbWrapper.customRequest(query, CrawlerTask.DB_URLS)
221  if result is not None and len(result) > 0 and len(result[0]) > 0 and result[0][0] > 0:
222  if recrawlPeriod == 0 or result[0][1] > 0 or result[0][2] == dc.EventObjects.URL.TYPE_FETCHED or \
223  ("RECRAWL_NO_ROOT_URLS" in self.siteProperties and self.siteProperties["RECRAWL_NO_ROOT_URLS"] == "0"):
224  logger.debug("URL skipped, exists and re-crawling not active, time not reached or URL Type is " +
225  "RSS feed (not to fetch)\n %s %s", self.url, urlMd5)
226  ret = True
227  else:
228  self.updateURLFields(urlMd5, self.dbWrapper, self.siteId)
229  logger.debug("URL state updated to NEW because re-crawling\n %s %s", self.url, urlMd5)
230  ret = True
231  else:
232  logger.debug("URL %s treated as new\n %s", self.url, urlMd5)
233 
234  return ret
235 
236 

◆ processURL()

def dc_crawler.URLProcess.URLProcess.processURL (   self,
  realUrl,
  internalLinks,
  externalLinks,
  filtersApply = None,
  siteFilters = None,
  baseUrl = None 
)

Definition at line 157 of file URLProcess.py.

157  def processURL(self, realUrl, internalLinks, externalLinks, filtersApply=None, siteFilters=None, baseUrl=None):
158  self.checkFieldsIsNone(["urlObj", "siteId", "url"])
159  retUrl = None
160  retContinue = False
161 
162  logger.debug("URL: %s", self.url)
163 
164  if self.urlObj.type == dc.EventObjects.URL.TYPE_SINGLE or not self.url:
165  logger.debug("URL type is TYPE_SINGLE - urls are not collected. Skipping url.")
166  retContinue = True
167 
168  if not retContinue:
169  retUrl = UrlNormalize.execute(siteProperties=self.siteProperties, base=baseUrl, url=self.url, supportProtocols=self.protocolsList, log=logger)
170  if retUrl is not None:
171  localFilters = None
172  protocolAllowed = True
173  if filtersApply is not None:
174  # Stage 'collect urls protocols' and operation code 'regular expression'
175  logger.debug(">>> Filters() (3.1) siteFilters: " + str(siteFilters))
176  localFilters = Filters(siteFilters, self.dbWrapper, self.siteId, 0, None, Filters.OC_RE, \
177  Filters.STAGE_COLLECT_URLS_PROTOCOLS)
178 
179  if localFilters.isExistStage(Filters.STAGE_COLLECT_URLS_PROTOCOLS):
180  resFilterApply = filtersApply(siteFilters, retUrl, 0, self.dbWrapper, self.siteId,
181  None, Filters.OC_RE, Filters.STAGE_COLLECT_URLS_PROTOCOLS)
182  logger.debug("Filter apply: " + str(resFilterApply))
183 
184  protocolAllowed = self.checkUrlByProtocol(retUrl)
185  logger.debug("checkUrlByProtocol return: " + str(protocolAllowed))
186  logger.debug("retUrl: " + str(retUrl))
187  logger.debug("realUrl: " + str(realUrl))
188 
189  if protocolAllowed:
190  if Utils.parseHost(retUrl) == Utils.parseHost(realUrl):
191  internalLinks.append(retUrl)
192  logger.debug("URL classified as internal")
193  elif Utils.parseHost(retUrl):
194  externalLinks.append(retUrl)
195  logger.debug("URL classified as external")
196  else: # not valid url like http://
197  retContinue = True
198  else:
199  retContinue = True
200  else:
201  logger.debug(">>> Bad url normalization, url = " + str(retUrl))
202  retContinue = True
203  return retUrl, retContinue
204 
205 

◆ readCurrentCnt()

def dc_crawler.URLProcess.URLProcess.readCurrentCnt (   self,
  maxURLs 
)

Definition at line 126 of file URLProcess.py.

126  def readCurrentCnt(self, maxURLs):
127  currentCnt = 0
128  if self.dbWrapper is not None:
129  self.checkFieldsIsNone(["dbWrapper", "urlTable"])
130 
131  if maxURLs > 0:
132  countsql = "SELECT COUNT(*) AS cnt FROM `%s` WHERE NOT (Status=4 AND Crawled=0 AND Processed=0)" % \
133  (self.urlTable,)
134  result = self.dbWrapper.customRequest(countsql, CrawlerTask.DB_URLS)
135  if result is not None and len(result) > 0 and len(result[0]) > 0:
136  currentCnt = result[0][0]
137  else:
138  currentCnt = 0
139 
140  logger.debug("!!! maxURLs = %s, currentCnt = %s", str(maxURLs), str(currentCnt))
141 
142  return currentCnt
143 
144 

◆ recrawlUrlUpdateHandler()

def dc_crawler.URLProcess.URLProcess.recrawlUrlUpdateHandler (   self,
  dbWrapper,
  recrawlUrlUpdateProperty,
  urlUpdateObj 
)

Definition at line 298 of file URLProcess.py.

298  def recrawlUrlUpdateHandler(self, dbWrapper, recrawlUrlUpdateProperty, urlUpdateObj):
299  if dbWrapper is not None:
300  propertyStruct = None
301  try:
302  propertyStruct = json.loads(recrawlUrlUpdateProperty)
303  except Exception, err:
304  logger.error("Load property 'RECRAWL_URL_UPDATE' was failed, error: %s", str(err))
305 
306  # If load json was successfully
307  if propertyStruct is not None:
308  try:
309  # list elements or one element?
310  for pattern, rules in propertyStruct.items():
311  if re.search(pattern, self.url) is not None:
312  # Update data accord to parameters
313  if "new" in rules and int(rules["new"]) > 0:
314  saveAffectDB = dbWrapper.affect_db
315  dbWrapper.affect_db = True
316  dbWrapper.urlNew(self.urlObj)
317  dbWrapper.affect_db = saveAffectDB
318 
319  if "fields" in rules and isinstance(rules["fields"], dict):
320  for key, value in rules["fields"].items():
321  if key in DB_CONSTS.URLTableDict.values():
322  for urlUpdateObjName, DBSchemaName in DB_CONSTS.URLTableDict.items():
323  if key == DBSchemaName and hasattr(urlUpdateObj, urlUpdateObjName):
324  setattr(urlUpdateObj, urlUpdateObjName, value)
325  logger.debug("For '" + str(DBSchemaName) + "' found attribute 'UrlUpdate." + \
326  str(urlUpdateObjName) + "' and set value: " + str(value) + \
327  " type: " + str(type(value)))
328  break
329  else:
330  logger.debug("Wrong DB schema field name '" + str(key) + "' in property 'RECRAWL_URL_UPDATE'")
331 
332  except Exception, err:
333  logger.error("Usage property 'RECRAWL_URL_UPDATE' was failed, error: %s", str(err))
334 
335 

◆ resetErrorMask()

def dc_crawler.URLProcess.URLProcess.resetErrorMask (   self,
  batchItem 
)

Definition at line 550 of file URLProcess.py.

550  def resetErrorMask(self, batchItem):
551  if self.dbWrapper is not None:
552  self.checkFieldsIsNone(["dbWrapper", "siteId"])
553  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
554  normalizeMask=self.normMask)
555  urlUpdateObj.errorMask = batchItem.urlObj.errorMask = 0
556  urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")
557  urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")
558  self.dbWrapper.urlUpdate(urlUpdateObj)
559 
560 

◆ resolveHTTP()

def dc_crawler.URLProcess.URLProcess.resolveHTTP (   self,
  postForms,
  headersDict 
)

Definition at line 425 of file URLProcess.py.

425  def resolveHTTP(self, postForms, headersDict):
426  self.checkFieldsIsNone(["urlObj"])
427  logger.debug("headersDict: %s", str(headersDict))
428  postData = None
429  try:
430  method = self.urlObj.httpMethod.lower()
431  except Exception:
432  method = "get"
433  if method == "post":
434  postData = postForms
435  logger.debug("use post, post_data:%s", postData)
436 # else:
437 # logger.debug("last modified: <<%s>>", str((self.urlObj.lastModified)))
438 # if str(self.urlObj.lastModified) != "None" and str(self.urlObj.lastModified) != "NULL":
439 # logger.debug("If-Modified-Since: <<%s>>", self.urlObj.lastModified)
440 # headersDict["If-Modified-Since"] = \
441 # Utils.convertToHttpDateFmt(datetime.datetime.strptime(str(self.urlObj.lastModified), "%Y-%m-%d %H:%M:%S"))
442  return postData
443 
444 
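Only the POST method yields post data; any other (or missing) httpMethod falls back to GET and returns None. A sketch with a hypothetical stand-in for urlObj:

class _Url(object):  # hypothetical stand-in for dc.EventObjects.URL
    httpMethod = "POST"

urlProcess = URLProcess()
urlProcess.urlObj = _Url()
print(urlProcess.resolveHTTP({"q": "test"}, {}))  # {'q': 'test'}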

◆ resolveTableName()

def dc_crawler.URLProcess.URLProcess.resolveTableName (   self,
  localSiteId 
)

Definition at line 119 of file URLProcess.py.

119  def resolveTableName(self, localSiteId):
120  self.urlTable = self.DC_URLS_TABLE_PREFIX + localSiteId
121  return self.urlTable
122 
123 
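The per-site table name is simply DC_URLS_TABLE_PREFIX plus the site id (passed as a string):

urlProcess = URLProcess()
print(urlProcess.resolveTableName("223322"))  # urls_223322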

◆ setProtocols()

def dc_crawler.URLProcess.URLProcess.setProtocols (   self,
  protocols = None 
)

Definition at line 74 of file URLProcess.py.

74  def setProtocols(self, protocols=None):
75  if protocols is not None:
76  try:
77  self.protocolsList = json.loads(protocols)
78  except Exception:
79  self.protocolsList = []
80  logger.debug(">>> protocolsList = " + str(self.protocolsList))
81 
82 

◆ simpleURLCanonize()

def dc_crawler.URLProcess.URLProcess.simpleURLCanonize (   self,
  realUrl 
)

Definition at line 145 of file URLProcess.py.

145  def simpleURLCanonize(self, realUrl):
146  self.checkFieldsIsNone(["url"])
147  if not self.checkUrlByPath(self.url):
148  self.url = urlparse.urljoin(realUrl, self.url)
149  # normalization
150  retUrl = Utils.UrlNormalizator.normalize(self.url, self.protocolsList, self.normMask)
151  return retUrl
152 
153 
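Relative links are first joined against the real (base) URL, then passed through the project's URL normalizer; a sketch:

urlProcess = URLProcess()
urlProcess.url = "../news/index.html"
canonical = urlProcess.simpleURLCanonize("http://example.com/a/b/page.html")
# joined to http://example.com/a/news/index.html before normalization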

◆ updateAdditionProps()

def dc_crawler.URLProcess.URLProcess.updateAdditionProps (   self,
  internalLinksCount,
  externalLinksCount,
  batchItem,
  size,
  freq,
  contentMd5 
)

Definition at line 787 of file URLProcess.py.

787  def updateAdditionProps(self, internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5):
788  if self.dbWrapper is not None:
789  self.checkFieldsIsNone(["dbWrapper", "siteId"])
790  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
791  normalizeMask=self.normMask)
792  urlUpdateObj.tcDate = SQLExpression("NOW()")
793  urlUpdateObj.size = size
794  urlUpdateObj.linksI = internalLinksCount
795  urlUpdateObj.linksE = externalLinksCount
796  urlUpdateObj.freq = freq
797  urlUpdateObj.rawContentMd5 = contentMd5
798  self.dbWrapper.urlUpdate(urlUpdateObj)
799 
800 

◆ updateCollectTimeAndMime()

def dc_crawler.URLProcess.URLProcess.updateCollectTimeAndMime (   self,
  detectedMime,
  batchItem,
  crawledTime,
  autoDetectMime,
  httpHeaders = None,
  strContent = None 
)

Definition at line 714 of file URLProcess.py.

714  strContent=None):
715  if self.dbWrapper is not None:
716  self.checkFieldsIsNone(["dbWrapper", "siteId"])
717  if crawledTime is not None:
718  collectTime = int((time.time() - crawledTime) * 1000)
719  else:
720  collectTime = 0
721  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
722  normalizeMask=self.normMask)
723  if strContent is not None:
724  urlUpdateObj.rawContentMd5 = hashlib.md5(strContent).hexdigest()
725  urlUpdateObj.crawlingTime = SQLExpression(("`CrawlingTime` + %s" % str(collectTime)))
726  urlUpdateObj.totalTime = SQLExpression(("`TotalTime` + %s" % str(collectTime)))
727  urlUpdateObj.tcDate = SQLExpression("NOW()")
728  urlUpdateObj.UDate = SQLExpression("NOW()")
729  logger.debug(">>> detectMime = " + str(detectedMime))
730  if httpHeaders is not None:
731  for header in httpHeaders:
732  if header.lower() == "etag":
733  # Simply take the first value if several are present
734  urlUpdateObj.eTag = httpHeaders[header].split(',')[0].strip("\"'")
735  if detectedMime is not None and autoDetectMime is not None:
736  urlUpdateObj.contentType = str(detectedMime)
737  self.dbWrapper.urlUpdate(urlUpdateObj)
738 
739 

◆ updateCrawledURL()

def dc_crawler.URLProcess.URLProcess.updateCrawledURL (   self,
  crawledResource,
  batchItem,
  contentSize,
  status = dc.EventObjects.URL.STATUS_CRAWLED 
)

Definition at line 448 of file URLProcess.py.

448  def updateCrawledURL(self, crawledResource, batchItem, contentSize, status=dc.EventObjects.URL.STATUS_CRAWLED):
449 
450  if self.dbWrapper is not None:
451  self.checkFieldsIsNone(["urlObj", "dbWrapper", "siteId"])
452  logger.debug(">>> Start urls update")
453 
454  updatedCount = self.urlObj.mRate * self.urlObj.mRateCounter
455  if crawledResource.http_code != 304:
456  updatedCount += 1
457  mrate = updatedCount / (self.urlObj.mRateCounter + 1)
458 
459 
460  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
461  normalizeMask=self.normMask)
462 
463  urlUpdateObj.contentType = batchItem.urlObj.contentType
464  urlUpdateObj.charset = batchItem.urlObj.charset
465  urlUpdateObj.errorMask = batchItem.urlObj.errorMask
466  urlUpdateObj.crawlingTime = batchItem.urlObj.crawlingTime
467  urlUpdateObj.totalTime = batchItem.urlObj.crawlingTime
468  urlUpdateObj.httpCode = batchItem.urlObj.httpCode
469 
470  urlUpdateObj.status = batchItem.urlObj.status = status
471  urlUpdateObj.size = batchItem.urlObj.size = contentSize
472  urlUpdateObj.mRate = batchItem.urlObj.mRate = mrate
473 
474  batchItem.urlObj.UDate = batchItem.urlObj.tcDate = str(datetime.datetime.now())
475  urlUpdateObj.UDate = urlUpdateObj.tcDate = SQLExpression("NOW()")
476  batchItem.urlObj.mRateCounter += 1
477  urlUpdateObj.mRateCounter = SQLExpression("`MRateCounter` + 1")
478  urlUpdateObj.lastModified = batchItem.urlObj.lastModified = crawledResource.last_modified
479  urlUpdateObj.urlMd5 = batchItem.urlObj.urlMd5
480 
481  if APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER in self.siteProperties:
482  # Evaluate URL class values if necessary
483  changedFieldsDict = FieldsSQLExpressionEvaluator.execute(self.siteProperties, self.dbWrapper, None,
484  batchItem.urlObj, logger,
485  APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER)
486  # Update URL values
487  if changedFieldsDict is not None:
488  for name, value in changedFieldsDict.items():
489  if hasattr(urlUpdateObj, name):
490  setattr(urlUpdateObj, name, value)
491 
492  logger.debug("!!! Before self.dbWrapper.urlUpdate(urlUpdateObj, \"`Status` = 3\")")
493  affectDB = self.dbWrapper.affect_db
494  self.dbWrapper.affect_db = True
495  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj, "`Status` = 3")
496  self.dbWrapper.affect_db = affectDB
497  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
498 
499 

◆ updateTypeForURLObjects()

def dc_crawler.URLProcess.URLProcess.updateTypeForURLObjects (   self,
  urlObjects,
  typeArg = dc.EventObjects.URL.TYPE_CHAIN 
)

Definition at line 855 of file URLProcess.py.

855  def updateTypeForURLObjects(self, urlObjects, typeArg=dc.EventObjects.URL.TYPE_CHAIN):
856  if self.dbWrapper is not None:
857  updateUrlObjects = []
858  for urlObject in urlObjects:
859  localUrlObject = dc.EventObjects.URLUpdate(urlObject.siteId, urlObject.url, normalizeMask=self.normMask)
860  localUrlObject.urlMd5 = urlObject.urlMd5
861  localUrlObject.type = typeArg
862  updateUrlObjects.append(localUrlObject)
863  if len(updateUrlObjects) > 0:
864  self.dbWrapper.urlUpdate(updateUrlObjects)
865 
866 
867 

◆ updateURL()

def dc_crawler.URLProcess.URLProcess.updateURL (   self,
  batchItem,
  batchId,
  status = dc.EventObjects.URL.STATUS_CRAWLING 
)

Definition at line 505 of file URLProcess.py.

505  def updateURL(self, batchItem, batchId, status=dc.EventObjects.URL.STATUS_CRAWLING):
506 
507  if self.dbWrapper is not None:
508  self.checkFieldsIsNone(["urlObj", "dbWrapper", "siteId"])
509  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
510  normalizeMask=self.normMask)
511  urlUpdateObj.batchId = batchId
512  if not self.urlObj.httpMethod:
513  urlUpdateObj.httpMethod = batchItem.urlObj.httpMethod = "get"
514  else:
515  urlUpdateObj.httpMethod = batchItem.urlObj.httpMethod = self.urlObj.httpMethod
516 
517  urlUpdateObj.status = batchItem.urlObj.status = status
518  batchItem.urlObj.crawled += 1
519  urlUpdateObj.crawled = SQLExpression("`Crawled`+1")
520  urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")
521  urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")
522 
523  if status == dc.EventObjects.URL.STATUS_CRAWLING:
524 
525  urlUpdateObj.contentType = batchItem.urlObj.contentType = dc.EventObjects.URL.CONTENT_TYPE_UNDEFINED
526 
527  urlUpdateObj.httpCode = batchItem.urlObj.httpCode = 0
528 
529  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj)
530  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
531 
532 

◆ updateURLFields()

def dc_crawler.URLProcess.URLProcess.updateURLFields (   self,
  urlMd5,
  wrapper,
  siteId 
)

Definition at line 242 of file URLProcess.py.

242  def updateURLFields(self, urlMd5, wrapper, siteId):
243  urlUpdateObj = dc.EventObjects.URLUpdate(siteId, urlMd5, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
244  normalizeMask=self.normMask)
245  if self.siteProperties is None:
246  self.siteProperties = {}
247 
248  logger.debug('!!!!!! updateURLFields !!! self.siteProperties: ' + str(self.siteProperties))
249  # Status field
250  if "RECRAWL_URL_UPDATE_STATUS" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_STATUS"] != "-1":
251  urlUpdateObj.status = int(self.siteProperties["RECRAWL_URL_UPDATE_STATUS"])
252  else:
253  if "RECRAWL_URL_UPDATE_STATUS" in self.siteProperties and \
254  self.siteProperties["RECRAWL_URL_UPDATE_STATUS"] == "-1":
255  urlUpdateObj.status = None
256  else:
257  urlUpdateObj.status = dc.EventObjects.URL.STATUS_NEW
258 
259  # TcDate field
260  if "RECRAWL_URL_UPDATE_TCDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_TCDATE"] != "":
261  urlUpdateObj.tcDate = self.siteProperties["RECRAWL_URL_UPDATE_TCDATE"]
262  else:
263  if "RECRAWL_URL_UPDATE_TCDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_TCDATE"] == "":
264  urlUpdateObj.tcDate = None
265  else:
266  urlUpdateObj.tcDate = SQLExpression("NOW()")
267 
268  # CDate field
269  if "RECRAWL_URL_UPDATE_CDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_CDATE"] != "":
270  urlUpdateObj.CDate = self.siteProperties["RECRAWL_URL_UPDATE_CDATE"]
271 
272  # UDate field
273  if "RECRAWL_URL_UPDATE_UDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_UDATE"] != "":
274  urlUpdateObj.UDate = self.siteProperties["RECRAWL_URL_UPDATE_UDATE"]
275  else:
276  if "RECRAWL_URL_UPDATE_UDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE_UDATE"] == "":
277  urlUpdateObj.UDate = None
278  else:
279  urlUpdateObj.UDate = SQLExpression("NOW()")
280 
281  # Recrawl url update
282  if "RECRAWL_URL_UPDATE" in self.siteProperties and self.siteProperties["RECRAWL_URL_UPDATE"] != "":
283  self.recrawlUrlUpdateHandler(wrapper, self.siteProperties["RECRAWL_URL_UPDATE"], urlUpdateObj)
284 
285  if wrapper is not None:
286  saveAffectDB = wrapper.affect_db
287  wrapper.affect_db = True
288  wrapper.urlUpdate(urlUpdateObj, "`State`=0")
289  wrapper.affect_db = saveAffectDB
290 
291 

◆ updateURLForFailed()

def dc_crawler.URLProcess.URLProcess.updateURLForFailed (   self,
  errorBit,
  batchItem,
  httpCode = CONSTS.HTTP_CODE_400,
  status = dc.EventObjects.URL.STATUS_CRAWLED,
  updateUdate = True 
)

Definition at line 375 of file URLProcess.py.

375  status=dc.EventObjects.URL.STATUS_CRAWLED, updateUdate=True):
376  if self.dbWrapper is not None:
377  self.checkFieldsIsNone(["dbWrapper", "siteId"])
378  logger.debug("Set errorBit = " + str(errorBit) + ", httpCode = " + str(httpCode))
379  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, batchItem.urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
380  normalizeMask=self.normMask)
381 
382  batchItem.urlObj.errorMask = batchItem.urlObj.errorMask | errorBit
383  urlUpdateObj.errorMask = SQLExpression("`ErrorMask` | " + str(errorBit))
384 
385  urlUpdateObj.status = batchItem.urlObj.status = status
386  urlUpdateObj.tcDate = batchItem.urlObj.tcDate = SQLExpression("NOW()")
387  if updateUdate:
388  urlUpdateObj.UDate = batchItem.urlObj.UDate = SQLExpression("NOW()")
389 
390  if httpCode is not None:
391  urlUpdateObj.httpCode = batchItem.urlObj.httpCode = httpCode
392  self.urlObj.httpCode = httpCode # #???
393 
394  if self.dbWrapper is not None:
395  # Evaluate URL class values if necessary
396  changedFieldsDict = FieldsSQLExpressionEvaluator.execute(self.siteProperties, self.dbWrapper, None,
397  batchItem.urlObj, logger,
398  APP_CONSTS.SQL_EXPRESSION_FIELDS_UPDATE_CRAWLER)
399  # Update URL values
400  for name, value in changedFieldsDict.items():
401  if hasattr(urlUpdateObj, name):
402  setattr(urlUpdateObj, name, value)
403  urlUpdateObj.errorMask = SQLExpression("`ErrorMask` | " + str(errorBit))
404 
405  # Update URL data in DB
406  self.dbWrapper.urlUpdate(urlUpdateObj)
407 
408 

◆ updateURLStatus()

def dc_crawler.URLProcess.URLProcess.updateURLStatus (   self,
  urlId,
  status = dc.EventObjects.URL.STATUS_CRAWLED 
)

Definition at line 537 of file URLProcess.py.

537  def updateURLStatus(self, urlId, status=dc.EventObjects.URL.STATUS_CRAWLED):
538  if status is not None and self.dbWrapper is not None:
539  self.checkFieldsIsNone(["siteId"])
540  urlUpdateObj = dc.EventObjects.URLUpdate(self.siteId, urlId, dc.EventObjects.URLStatus.URL_TYPE_MD5, \
541  normalizeMask=self.normMask)
542  urlUpdateObj.status = status
543  updatedRowsCount = self.dbWrapper.urlUpdate(urlUpdateObj)
544  logger.debug("!!! updatedRowsCount = " + str(updatedRowsCount))
545 
546 

◆ urlDBSync()

def dc_crawler.URLProcess.URLProcess.urlDBSync (   self,
  batchItem,
  crawlerType,
  recrawlPeriod,
  autoRemoveProps 
)

Definition at line 746 of file URLProcess.py.

746  def urlDBSync(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps):
747  if self.dbWrapper is not None:
748  self.checkFieldsIsNone(["dbWrapper", "siteId"])
749  self.isUpdateCollection = False
750  # Query to check whether the URL already exists on this host
751  sqlQuery = "SELECT COUNT(*) FROM `%s` WHERE `URLMd5` = '%s'" % \
752  (DB_CONSTS.DC_URLS_TABLE_NAME_TEMPLATE % self.siteId, batchItem.urlId)
753  logger.debug("!!! urlDBSync sqlQuery: " + str(sqlQuery))
754 
755  result = self.dbWrapper.customRequest(sqlQuery, CrawlerTask.DB_URLS)
756  logger.debug("!!! urlDBSync result: " + varDump(result))
757 
758  isExist = False
759  if result is not None and len(result) > 0 and len(result[0]) > 0:
760  logger.debug("!!! urlDBSync result[0][0]: " + str(result[0][0]) + " type: " + str(type(result[0][0])))
761  isExist = bool(int(result[0][0]) > 0)
762 
763  try:
764  if isExist:
765  logger.debug("Url already exist in DB.")
766  else:
767  # When the URL comes from another DC cluster host it is not yet present in the DB
768  if self.addURLFromBatchToDB(batchItem, crawlerType, recrawlPeriod, autoRemoveProps):
769  self.urlDBSync(batchItem, crawlerType, recrawlPeriod, autoRemoveProps)
770  else:
771  msg = "Can't add url from batch."
772  logger.debug(msg)
773  raise SyncronizeException(msg)
774  except SyncronizeException, err:
775  logger.debug("Can't synchronize url with db: " + str(err))
776  raise err
777 
778 

◆ urlTemplateApply()

def dc_crawler.URLProcess.URLProcess.urlTemplateApply (   self,
  url,
  crawlerType,
  urlTempalteRegular,
  urlTempalteRealtime,
  urlTempalteRegularEncode,
  urlTempalteRealtimeEncode 
)

Definition at line 986 of file URLProcess.py.

986  urlTempalteRealtimeEncode):
987  ret = url
988  if crawlerType == dc.EventObjects.Batch.TYPE_REAL_TIME_CRAWLER:
989  if urlTempalteRealtime is not None:
990  try:
991  if urlTempalteRealtimeEncode is not None and bool(int(urlTempalteRealtimeEncode)):
992  encodedUrl = urllib.quote(url)
993  else:
994  encodedUrl = url
995  except ValueError:
996  encodedUrl = url
997  ret = urlTempalteRealtime.replace(self.URL_TEMPLATE_CONST, encodedUrl)
998  else:
999  if urlTempalteRegular is not None:
1000  try:
1001  if urlTempalteRegularEncode is not None and bool(int(urlTempalteRegularEncode)):
1002  encodedUrl = urllib.quote(url)
1003  else:
1004  encodedUrl = url
1005  except ValueError:
1006  encodedUrl = url
1007  ret = urlTempalteRegular.replace(self.URL_TEMPLATE_CONST, encodedUrl)
1008  if ret != url:
1009  logger.debug(">>> url was replaced ")
1010  logger.debug(">>> new url = " + ret)
1011  return ret
1012 
1013 
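The URL_TEMPLATE_CONST placeholder (%URL%) in the matching template is replaced with the URL, percent-encoded when the corresponding encode flag parses as a non-zero integer. A sketch for the regular-crawler branch (crawlerType 0 is assumed to differ from TYPE_REAL_TIME_CRAWLER; the template is illustrative):

urlProcess = URLProcess()
newUrl = urlProcess.urlTemplateApply("http://example.com/page", 0,
                                     "http://render.local/get?u=%URL%", None, "1", None)
print(newUrl)  # http://render.local/get?u=http%3A//example.com/page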

Member Data Documentation

◆ dbWrapper

dc_crawler.URLProcess.URLProcess.dbWrapper

Definition at line 62 of file URLProcess.py.

◆ DC_URLS_TABLE_PREFIX

string dc_crawler.URLProcess.URLProcess.DC_URLS_TABLE_PREFIX = "urls_"
static

Definition at line 50 of file URLProcess.py.

◆ DEFAULT_PROTOCOLS

list dc_crawler.URLProcess.URLProcess.DEFAULT_PROTOCOLS = ["http", "https"]
static

Definition at line 55 of file URLProcess.py.

◆ DETECT_MIME_TIMEOUT

int dc_crawler.URLProcess.URLProcess.DETECT_MIME_TIMEOUT = 1
static

Definition at line 51 of file URLProcess.py.

◆ isUpdateCollection

dc_crawler.URLProcess.URLProcess.isUpdateCollection

Definition at line 59 of file URLProcess.py.

◆ normMask

dc_crawler.URLProcess.URLProcess.normMask

Definition at line 69 of file URLProcess.py.

◆ PATTERN_WITH_PROTOCOL

dc_crawler.URLProcess.URLProcess.PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
static

Definition at line 52 of file URLProcess.py.

◆ PROTOCOL_PREFIX

string dc_crawler.URLProcess.URLProcess.PROTOCOL_PREFIX = "://"
static

Definition at line 54 of file URLProcess.py.

◆ protocolsList

dc_crawler.URLProcess.URLProcess.protocolsList

Definition at line 66 of file URLProcess.py.

◆ site

dc_crawler.URLProcess.URLProcess.site

Definition at line 64 of file URLProcess.py.

◆ siteId

dc_crawler.URLProcess.URLProcess.siteId

Definition at line 63 of file URLProcess.py.

◆ siteProperties

dc_crawler.URLProcess.URLProcess.siteProperties

Definition at line 67 of file URLProcess.py.

◆ url

dc_crawler.URLProcess.URLProcess.url

Definition at line 61 of file URLProcess.py.

◆ URL_TEMPLATE_CONST

string dc_crawler.URLProcess.URLProcess.URL_TEMPLATE_CONST = "%URL%"
static

Definition at line 53 of file URLProcess.py.

◆ urlObj

dc_crawler.URLProcess.URLProcess.urlObj

Definition at line 60 of file URLProcess.py.

◆ urlTable

dc_crawler.URLProcess.URLProcess.urlTable

Definition at line 65 of file URLProcess.py.


The documentation for this class was generated from the following file:
URLProcess.py