HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.CollectURLs.CollectURLs Class Reference
Inheritance diagram for dc_crawler.CollectURLs.CollectURLs:
Collaboration diagram for dc_crawler.CollectURLs.CollectURLs:

Public Member Functions

def __init__ (self, isAbortedByTTL=None)
 
def checkFieldsIsNone (self)
 
def process (self, httpCode, readOnly=False, httpApplyHeaders=None, proxyName=None)
 
def feedElementsProcessing (self, urlMd5, httpCode, elemUrl, localSiteId, localUrlObj, localUrl, params, maxURLsFromPage, rootFeed=False)
 
def processProcessor (self, urlSet, dom, urlXpathList, urlObj)
 
def evaluateDateMacro (self, localPattern, dateFromat)
 
def extractFormURL (self, dom, siteProperties)
 
def getFieldParams (self, formFields, postForms, siteId)
 
def insertNewSiteProperties (self, params, wrapper, siteId)
 
def feedparserParseDateFixes (self, aDateString)
 

Static Public Member Functions

def filtersApply (inputFilters, subject, depth, wrapper, siteId, fields=None, opCode=Filters.OC_RE, stage=Filters.STAGE_COLLECT_URLS, selectSubject=None, defaultValue=False)
 

Public Attributes

 crawledResource
 
 url
 
 dom
 
 realUrl
 
 baseUrl
 
 processorName
 
 batchItem
 
 urlXpathList
 
 feedItems
 
 feed
 
 siteProperties
 
 site
 
 dbWrapper
 
 autoRemoveProps
 
 autoDetectMime
 
 processContentTypes
 
 postForms
 
 urlProcess
 
 robotsParser
 
 urlsXpathList
 
 isAbortedByTTL
 

Static Public Attributes

 BINARY_CONTENT_TYPE_PATTERN = re.compile('(text)|(xml)', re.I)
 
string COLLECT_POST_DATA_NAME = "COLLECT_POST_DATA"
 
string COLLECT_POST_DATA = "1"
 
string DC_URLS_TABLE_PREFIX = "urls_"
 
 PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
 
string DETECT_MIME_MAIN_CONTENT = "1"
 
string DETECT_MIME_COLLECTED_URL = "2"
 
int DETECT_MIME_TIMEOUT = 1
 

Private Member Functions

def _normalize_attributes (self, kv)
 

Detailed Description

Definition at line 38 of file CollectURLs.py.
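
The source carries no detailed description. The following is a minimal usage sketch inferred from the public attributes and the process() listing below; every right-hand-side object (crawledResource, urlObj, site, dbWrapper and so on) is a hypothetical placeholder for an object built elsewhere by the crawler task, not a verified API.

# Minimal, hypothetical wiring sketch; assumes the dc_crawler package is importable
# and that the collaborator objects are created elsewhere by the crawler task.
from dc_crawler.CollectURLs import CollectURLs

collector = CollectURLs(isAbortedByTTL=lambda: False)

collector.crawledResource = crawledResource   # fetched resource (html_content, content_type, ...)
collector.url = urlObj                        # dc.EventObjects.URL of the crawled page
collector.dom = dom                           # lxml DOM of the page, or None
collector.realUrl = urlObj.url
collector.baseUrl = baseUrl
collector.processorName = processorName       # e.g. PCONSTS.PROCESSOR_RSS
collector.batchItem = batchItem
collector.urlXpathList = {}                   # COLLECT_URLS_XPATH_LIST site property, if any
collector.siteProperties = siteProperties
collector.site = site                         # dc.EventObjects.Site
collector.dbWrapper = dbWrapper
collector.autoRemoveProps = autoRemoveProps
collector.autoDetectMime = autoDetectMime
collector.postForms = postForms
collector.urlProcess = urlProcess             # UrlProcess-style helper used for URL handling
collector.robotsParser = robotsParser

# collector.process(...) can now be called; see process() below for the call
# and the meaning of the returned tuple.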

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.CollectURLs.CollectURLs.__init__ (   self,
  isAbortedByTTL = None 
)

Definition at line 50 of file CollectURLs.py.

50  def __init__(self, isAbortedByTTL=None):
51  self.crawledResource = None
52  self.url = None
53  self.dom = None
54  self.realUrl = None
55  self.baseUrl = None
56  self.processorName = None
57  self.batchItem = None
58  self.urlXpathList = None
59  self.feedItems = None
60  self.feed = None
61  self.siteProperties = None
62  self.site = None
63  self.dbWrapper = None
64  self.autoRemoveProps = None
65  self.autoDetectMime = None
66  self.processContentTypes = []
67  self.postForms = None
68  self.urlProcess = None
69  self.robotsParser = None
70  self.urlsXpathList = []
71  self.isAbortedByTTL = (lambda: False) if isAbortedByTTL is None else isAbortedByTTL
72 
73 

Member Function Documentation

◆ _normalize_attributes()

def dc_crawler.CollectURLs.CollectURLs._normalize_attributes (   self,
  kv 
)
private

Definition at line 730 of file CollectURLs.py.

730  def _normalize_attributes(self, kv):
731  return (kv[0], kv[1])
732 
Here is the caller graph for this function:

◆ checkFieldsIsNone()

def dc_crawler.CollectURLs.CollectURLs.checkFieldsIsNone (   self)

Definition at line 76 of file CollectURLs.py.

76  def checkFieldsIsNone(self):
77  excludeList = ['feedItems', 'feed', 'processorName', 'autoDetectMime', 'processContentTypes',
78  'postForms', 'robotsParser', 'dom', 'dbWrapper']
79  for field in self.__dict__:
80  if field not in excludeList and (not hasattr(self, field) or getattr(self, field) is None):
81  msg = "Mandatory field must be initialized, field Name = " + field
82  logger.error(msg)
83  raise Exception(msg)
84 
85 
Here is the caller graph for this function:

◆ evaluateDateMacro()

def dc_crawler.CollectURLs.CollectURLs.evaluateDateMacro (   self,
  localPattern,
  dateFromat 
)

Definition at line 565 of file CollectURLs.py.

565  def evaluateDateMacro(self, localPattern, dateFromat):
566  import time
567  import datetime
568  try:
569  d = {'DATE':'', 'SHORTYEAR':'y', 'YEAR':'Y', 'MONTH':'m', 'DAY':'d', 'HOUR':'H', 'MINUTE':'M', 'SECOND':'S'}
570  regex = re.compile("%@(SHORTYEAR|YEAR|MONTH|DAY|HOUR|MINUTE|SECOND|DATE)\\(([\\+|\\-]\\d{1,2})\\)%")
571  matchArray = regex.findall(localPattern)
572  for i in matchArray:
573  if i[0] == 'DATE':
574  f = dateFromat
575  else:
576  f = '%' + d[i[0]]
577  t = time.strftime(f, time.gmtime(time.time() + datetime.timedelta(hours=(+int(i[1]))).seconds))
578  localPattern = localPattern.replace("%@" + i[0] + "(" + i[1] + ")%", t)
579  except Exception, err:
580  logger.error(str(err))
581 
582  return localPattern
583 
584 
Here is the caller graph for this function:
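
evaluateDateMacro() expands %@NAME(shift)% placeholders against the current GMT time. Below is a simplified, self-contained sketch of the expansion; the real method additionally handles the DATE placeholder through the dateFromat argument and applies the hour shift via a timedelta.

import re
import time

# Simplified sketch; the placeholder grammar is the one used in the listing above.
# DATE is omitted here because the real method expands it separately.
MACRO_RE = re.compile(r"%@(SHORTYEAR|YEAR|MONTH|DAY|HOUR|MINUTE|SECOND)\(([\+|\-]\d{1,2})\)%")
FORMATS = {'SHORTYEAR': 'y', 'YEAR': 'Y', 'MONTH': 'm', 'DAY': 'd',
           'HOUR': 'H', 'MINUTE': 'M', 'SECOND': 'S'}

pattern = "archive/%@YEAR(+0)%/%@MONTH(+0)%/index.html"   # hypothetical URL pattern
for name, shift in MACRO_RE.findall(pattern):
    stamp = time.strftime('%' + FORMATS[name], time.gmtime(time.time() + int(shift) * 3600))
    pattern = pattern.replace("%@" + name + "(" + shift + ")%", stamp)
print(pattern)   # e.g. "archive/2015/08/index.html"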

◆ extractFormURL()

def dc_crawler.CollectURLs.CollectURLs.extractFormURL (   self,
  dom,
  siteProperties 
)

Definition at line 591 of file CollectURLs.py.

591  def extractFormURL(self, dom, siteProperties):
592  formUrls, formMethods, formFields = [], {}, {}
593  if self.COLLECT_POST_DATA_NAME in siteProperties and \
594  siteProperties['COLLECT_POST_DATA'] == self.COLLECT_POST_DATA:
595  for form in dom.xpath("//form"):
596  formAction = None
597  formMethod = 'get'
598  for attr in form.keys():
599  if attr.lower() == "action":
600  formAction = form.get(attr)
601  formUrls.append(formAction)
602  elif attr.lower() == "method":
603  formMethod = form.get(attr)
604  if not formAction:
605  continue
606  formMethods[formAction] = formMethod
607  for field in form.getchildren():
608  tagName, tagValue = None, ""
609  for fieldTag in field.keys():
610  if fieldTag.lower() == "name":
611  tagName = field.get(fieldTag)
612  elif fieldTag.lower() == "value":
613  tagValue = field.get(fieldTag)
614  if tagName:
615  formFields[tagName] = tagValue
616  logger.info("extracted form data, formUrls:%s, formMethods:%s, formFields:%s", \
617  formUrls, formMethods, formFields)
618  return formUrls, formMethods, formFields
619 
620 
Here is the call graph for this function:
Here is the caller graph for this function:
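
With the COLLECT_POST_DATA site property set to "1", a simple form yields the three structures shown below. This is a hedged sketch assuming the dc_crawler package and lxml are importable; the form markup is a made-up example.

import lxml.html
from dc_crawler.CollectURLs import CollectURLs   # assumes the package is on sys.path

dom = lxml.html.fromstring(
    '<html><body>'
    '<form action="/search" method="post">'
    '<input name="q" value=""/>'
    '<input name="lang" value="en"/>'
    '</form></body></html>')

collector = CollectURLs()
formUrls, formMethods, formFields = collector.extractFormURL(dom, {"COLLECT_POST_DATA": "1"})
# formUrls    == ['/search']
# formMethods == {'/search': 'post'}
# formFields  == {'q': '', 'lang': 'en'}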

◆ feedElementsProcessing()

def dc_crawler.CollectURLs.CollectURLs.feedElementsProcessing (   self,
  urlMd5,
  httpCode,
  elemUrl,
  localSiteId,
  localUrlObj,
  localUrl,
  params,
  maxURLsFromPage,
  rootFeed = False 
)

Definition at line 441 of file CollectURLs.py.

441  maxURLsFromPage, rootFeed=False):
442 
443  if maxURLsFromPage > 0 and len(self.feedItems) >= maxURLsFromPage:
444  logger.debug("Site maxURLsFromPage = %s limit reached on %s number.",
445  str(maxURLsFromPage), str(len(self.feedItems)))
446  else:
447  if self.feed is not None:
448  self.urlProcess.url = elemUrl
449  self.urlProcess.siteId = localSiteId
450  self.urlProcess.urlObj = localUrlObj
451  localRet = self.urlProcess.fillRssFieldInUrlObj(localUrl, self.url.url, self.batchItem, self.processorName,
452  self.feed, rootFeed)
453  self.urlProcess.urlObj = None
454  if localRet is not None:
455  localRet["urlMd5"] = urlMd5
456  if localRet["urlObj"] is not None:
457  localRet["urlObj"].httpCode = httpCode
458  localRet["urlObj"].processingDelay = 0
459  localRet["urlObj"].parentMd5 = self.url.urlMd5
460 
461  # logger.debug("localRet = %s", str(dict(localRet)))
462 
463  params.append(localRet["urlObj"])
464  self.feedItems.append(localRet)
465  else:
466  logger.debug("self.feed is None!")
467 
Here is the caller graph for this function:

◆ feedparserParseDateFixes()

def dc_crawler.CollectURLs.CollectURLs.feedparserParseDateFixes (   self,
  aDateString 
)

Definition at line 715 of file CollectURLs.py.

715  def feedparserParseDateFixes(self, aDateString):
716  ret = None
717  ds = aDateString
718 
719  # Assumes that date format broken and contains the semicolon ":" in TZ like: "Wed, 19 Aug 2015 08:45:53 +01:00"
720  parts = ds.split(' ')
721  if ("+" in parts[len(parts) - 1] or "-" in parts[len(parts) - 1]) and ":" in parts[len(parts) - 1]:
722  parts[len(parts) - 1] = parts[len(parts) - 1].replace(":", "")
723  ds = " ".join(parts)
724  # ret = feedparser._parse_date_rfc822(ds)
725  ret = feedparser._parse_date(ds) # pylint: disable=W0212
726 
727  return ret
728 
729 
Here is the caller graph for this function:
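
The fix is straightforward: an RFC 822 timestamp whose timezone offset contains a colon (e.g. "+01:00") is rewritten into the form feedparser's stock handlers accept, then re-parsed. A standalone illustration of the rewrite step:

# Standalone illustration of the timezone rewrite performed before re-parsing.
broken = "Wed, 19 Aug 2015 08:45:53 +01:00"   # example taken from the source comment
parts = broken.split(' ')
if ("+" in parts[-1] or "-" in parts[-1]) and ":" in parts[-1]:
    parts[-1] = parts[-1].replace(":", "")    # "+01:00" -> "+0100"
print(" ".join(parts))                         # "Wed, 19 Aug 2015 08:45:53 +0100"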

◆ filtersApply()

def dc_crawler.CollectURLs.CollectURLs.filtersApply (   inputFilters,
  subject,
  depth,
  wrapper,
  siteId,
  fields = None,
  opCode = Filters.OC_RE,
  stage = Filters.STAGE_COLLECT_URLS,
  selectSubject = None,
  defaultValue = False 
)
static

Definition at line 636 of file CollectURLs.py.

636  stage=Filters.STAGE_COLLECT_URLS, selectSubject=None, defaultValue=False):
637  ret = defaultValue
638  fValue = Utils.generateReplacementDict()
639  fValue.update({"MAX_DEPTH": str(depth)})
640 
641  if inputFilters is not None:
642  for inputFilter in inputFilters:
643  if inputFilter.stage == Filters.STAGE_ALL or inputFilter.stage == Filters.STAGE_REDIRECT_URL:
644  inputFilter.stage = Filters.STAGE_COLLECT_URLS
645 
646 # logger.debug(">>> Filters() (2.1) fields: " + varDump(fields) + " inputFilters: " + varDump(inputFilters))
647  localFilters = Filters(filters=inputFilters, dbTaskWrapper=wrapper, siteId=siteId, readMode=0, fields=fields,
648  opCode=opCode, stage=stage, selectSubject=selectSubject)
649 
650  # logger.debug(">>> before filter include = " + subject[:255] + ' . . . ')
651  fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, 1)
652  logger.debug(">>> filter result include - " + str(fResult))
653  for elem in fResult:
654 # logger.debug('elem = ' + str(elem) + ' type: ' + str(type(elem)))
655  if elem > 0:
656  ret = True
657  break
658 
659  if ret is True:
660  # logger.debug(">>> before filter exclude = " + subject[:255] + ' . . . ')
661  fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, -1)
662  logger.debug(">>> filter result exclude - " + str(fResult))
663  for elem in fResult:
664 # logger.debug('elem = ' + str(elem) + ' type: ' + str(type(elem)))
665  if elem > 0:
666  ret = False
667  break
668 
669  logger.debug("Verdict: " + str(ret))
670  return ret
671 
672 
Here is the caller graph for this function:
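
A hypothetical call mirroring how process() uses this static method to accept or reject a candidate URL; site, urlObj and dbWrapper stand for objects available in the surrounding crawler task and are not defined here.

# Hypothetical invocation; mirrors the call made inside process().
passed = CollectURLs.filtersApply(inputFilters=site.filters,
                                  subject="http://example.com/news/1.html",
                                  depth=urlObj.depth,
                                  wrapper=dbWrapper,
                                  siteId="0")
if not passed:
    pass   # the candidate URL would be skipped, as in process()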

◆ getFieldParams()

def dc_crawler.CollectURLs.CollectURLs.getFieldParams (   self,
  formFields,
  postForms,
  siteId 
)

Definition at line 678 of file CollectURLs.py.

678  def getFieldParams(self, formFields, postForms, siteId):
679  ret = []
680  for fieldName, fieldValue in formFields.iteritems():
681  if fieldName in postForms:
682  continue
683  logger.debug("field_name: %s", fieldName)
684  logger.debug("field_value: %s", fieldValue)
685  ret.append((siteId, "HTTP_POST_FORM_" + fieldName, fieldValue))
686  return ret
687 
688 
Here is the caller graph for this function:
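
The returned triples feed directly into insertNewSiteProperties(); a quick sketch of the transformation with hypothetical field values:

# Sketch of the transformation done by getFieldParams() (hypothetical values).
formFields = {"login": "guest", "token": "abc123"}   # fields extracted from a form
postForms = {"login": "guest"}                        # HTTP_POST_FORM_* values already known
siteId = "0"

params = [(siteId, "HTTP_POST_FORM_" + name, value)
          for name, value in formFields.items()
          if name not in postForms]
# params == [("0", "HTTP_POST_FORM_token", "abc123")]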

◆ insertNewSiteProperties()

def dc_crawler.CollectURLs.CollectURLs.insertNewSiteProperties (   self,
  params,
  wrapper,
  siteId 
)

Definition at line 694 of file CollectURLs.py.

694  def insertNewSiteProperties(self, params, wrapper, siteId):
695  if siteId is not None and hasattr(params, '__iter__') and len(params) > 0:
696  localSiteUpdate = dc.EventObjects.SiteUpdate(siteId)
697  for attr in localSiteUpdate.__dict__:
698  if hasattr(localSiteUpdate, attr):
699  setattr(localSiteUpdate, attr, None)
700  localSiteUpdate.updateType = dc.EventObjects.SiteUpdate.UPDATE_TYPE_APPEND
701  localSiteUpdate.id = siteId
702  localSiteUpdate.properties = []
703  for param in params:
704  newPropElem = {}
705  newPropElem["siteId"] = param[0]
706  newPropElem["name"] = param[1]
707  newPropElem["value"] = param[2]
708  localSiteUpdate.properties.append(newPropElem)
709  wrapper.siteNewOrUpdate(localSiteUpdate, stype=dc.EventObjects.SiteUpdate)
710 
711 
Here is the caller graph for this function:

◆ process()

def dc_crawler.CollectURLs.CollectURLs.process (   self,
  httpCode,
  readOnly = False,
  httpApplyHeaders = None,
  proxyName = None 
)

Definition at line 92 of file CollectURLs.py.

92  def process(self, httpCode, readOnly=False, httpApplyHeaders=None, proxyName=None):
93 
94  self.checkFieldsIsNone()
95  if self.siteProperties is None:
96  self.siteProperties = {}
97  if self.processContentTypes is None:
98  self.processContentTypes = []
99  localSiteId = self.batchItem.siteId if self.batchItem.siteId else "0"
100  nextStep = True
101  useChains = False
102  internalLinks, externalLinks = [], []
103  maxURLsFromPage = 0
104  params = []
105  chainUrls = []
106  formUrls = None
107  formMethods = None
108  formFields = None
109  urlSet = set()
110 
111  logger.debug("!!! self.site.maxURLsFromPage = " + str(self.site.maxURLsFromPage))
112  logger.debug("!!! self.url.maxURLsFromPage = " + str(self.url.maxURLsFromPage))
113 
114  if self.site is not None and self.site.maxURLsFromPage is not None:
115  maxURLsFromPage = self.site.maxURLsFromPage
116 
117  if self.url is not None and self.url.maxURLsFromPage is not None and self.url.maxURLsFromPage > 0:
118  maxURLsFromPage = self.url.maxURLsFromPage
119 
120  if nextStep and self.crawledResource is not None and \
121  not self.BINARY_CONTENT_TYPE_PATTERN.search(self.crawledResource.content_type):
122  nextStep = False
123 
124  # don't parse url for 4XX or 5XX response
125  if nextStep and self.crawledResource is not None:
126  code_type = int(self.crawledResource.http_code) / 100
127  if code_type == 4 or code_type == 5:
128  nextStep = False
129 
130  if nextStep and self.crawledResource is not None and not self.crawledResource.html_content:
131  nextStep = False
132 
133  # if nextStep and self.dom is None:
134  # logger.debug("DOM is None")
135  # nextStep = False
136 
137  if nextStep:
138  useChains = True
139  if self.dom is not None:
140  self.processProcessor(urlSet, self.dom, self.urlXpathList, self.batchItem.urlObj)
141  formUrls, formMethods, formFields = self.extractFormURL(self.dom, self.siteProperties)
142  urlSet.update(formUrls)
143  else:
144  logger.debug("DOM is None")
145 
146  if self.url.type == dc.EventObjects.URL.TYPE_SINGLE:
147  logger.debug("URL type: single")
148  nextStep = False
149 
150  if nextStep and self.crawledResource.dynamic_fetcher_result_type == \
151  SeleniumFetcher.MACRO_RESULT_TYPE_URLS_LIST:
152  ul = None
153  try:
154  ul = json.loads(self.crawledResource.html_content)
155  except Exception, err:
156  logger.error("Error deserialize macro data from result string: %s\n%s", str(err),
157  self.crawledResource.html_content)
158  if ul is not None:
159  logger.debug("Fill urlSet from macro results: %s items", str(len(ul)))
160  if isinstance(ul, list):
161  urlSet.update([u for u in ul if isinstance(u, basestring) and u != ''])
162 
163  if nextStep:
164  if self.url.type == dc.EventObjects.URL.TYPE_CHAIN:
165  logger.debug("URL type: chain")
166  nextStep = False
167 
168  if nextStep:
169  # (3) END
170  urlTable = self.DC_URLS_TABLE_PREFIX + localSiteId
171  self.urlProcess.urlTable = urlTable
172  try:
173  if self.siteProperties is not None and "RSS_FEED_ZERO_ITEM" in self.siteProperties and \
174  int(self.siteProperties["RSS_FEED_ZERO_ITEM"]) == 1:
175  if self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS:
176  self.feedElementsProcessing(self.url.urlMd5, httpCode, self.url.url, localSiteId, self.url, self.url.url,
177  params, maxURLsFromPage, True)
178  except ValueError:
179  logger.debug(">>> Wrong \"RSS_FEED_ZERO_ITEM\" property's value")
180 
181  logger.debug("URLs candidates collected %s items:\n%s", str(len(urlSet)), str(urlSet))
182 
183  if self.site.maxURLs > 0 and len(urlSet) >= self.site.maxURLs:
184  urlSet = set(list(urlSet)[:self.site.maxURLs])
185  logger.debug("Site maxURLs = %s limit reached.", str(self.site.maxURLs))
186 
187  if self.site.maxResources > 0 and len(urlSet) >= self.site.maxResources:
188  urlSet = set(list(urlSet)[:self.site.maxResources])
189  logger.debug("Site maxResources = %s limit reached.", str(self.site.maxResources))
190 
191  countCnt = 0
192  countErrors = 0
193  for elemUrl in urlSet:
194 
195  if self.isAbortedByTTL():
196  logger.debug("Aborted by TTL. All elements skipped.")
197  break
198 
199  if elemUrl is None:
200  logger.debug("Some url from urlSet is None, skipped.")
201  continue
202 
203  elemUrl = elemUrl.strip()
204  if elemUrl == '':
205  logger.debug("Some url from urlSet is empty, skipped!")
206  continue
207 
208  localUrl = elemUrl
209  self.urlProcess.urlObj = self.url
210  self.urlProcess.url = elemUrl
211  self.urlProcess.dbWrapper = self.dbWrapper
212  self.urlProcess.siteId = localSiteId
213  retUrl, retContinue = self.urlProcess.processURL(self.realUrl, internalLinks, externalLinks, self.filtersApply,
214  None, self.baseUrl)
215  if retUrl is not None:
216  elemUrl = retUrl
217  elemUrl = UrlNormalize.execute(siteProperties=self.siteProperties, base=self.baseUrl, url=elemUrl, supportProtocols=None, log=logger)
218  else:
219  retContinue = True
220 
221  if retContinue:
222  logger.debug("Candidate URL is not passed general checks, skipped: %s", str(elemUrl))
223  continue
224 
225  # Apply filter to Url
226  if not self.filtersApply(inputFilters=self.site.filters,
227  subject=elemUrl,
228  depth=self.url.depth,
229  wrapper=self.dbWrapper,
230  siteId=localSiteId):
231  logger.debug("Candidate URL not matched filters, skipped.")
232  continue
233  else:
234  logger.debug("Candidate URL matched filters.")
235 
236  # Check exist of the Url
237  urlMd5 = hashlib.md5(elemUrl).hexdigest()
238  self.urlProcess.url = elemUrl
239  self.urlProcess.siteId = localSiteId
240  self.urlProcess.urlTable = urlTable
241  if self.urlProcess.isUrlExist(self.site.recrawlPeriod, urlMd5):
242  logger.debug("Candidate URL %s already exist, skipped.", str(urlMd5))
243  continue
244 
245  if self.site.maxURLs > 0:
246  if httpCode == CRAWLER_CONSTS.HTTP_CODE_200:
247  countCnt += 1
248  else:
249  countErrors += 1
250 
251  if self.dbWrapper is not None:
252  currentCnt = self.urlProcess.readCurrentCnt(self.site.maxURLs)
253  if currentCnt >= self.site.maxURLs or countCnt >= self.site.maxURLs or \
254  (countCnt + countErrors) >= self.site.maxURLs:
255  logger.debug("Site MaxURLs: %s limit is reached. countCnt = %s, currentCnt = %s",
256  str(self.site.maxURLs), str(countCnt), str(currentCnt))
257  autoremovedURLs = self.urlProcess.autoRemoveURL(self.autoRemoveProps, self.site.recrawlPeriod, urlTable,
258  self.dbWrapper)
259  if autoremovedURLs == 0:
260  logger.debug("No one URL auto removed, candidate URL skipped!")
261  continue
262  else:
263  logger.debug("%s URLs auto removed.", str(autoremovedURLs))
264 
265  if currentCnt >= self.site.maxResources or countCnt >= self.site.maxResources or \
266  (countCnt + countErrors) >= self.site.maxResources:
267  logger.debug("Site maxResources = %s limit is reached. countCnt = %s, currentCnt = %s",
268  str(self.site.maxResources), str(countCnt), str(currentCnt))
269  autoremovedURLs = self.urlProcess.autoRemoveURL(self.autoRemoveProps, self.site.recrawlPeriod, urlTable,
270  self.dbWrapper)
271  if autoremovedURLs == 0:
272  logger.debug("No one URL auto removed, candidate URL skipped!")
273  continue
274  else:
275  logger.debug("%s URLs auto removed.", str(autoremovedURLs))
276 
277  # detect collected url mime type and ignore non-match URL
278  # (7) Detect collected url mime type and ignore non-match URL
279  detectedMime = ''
280  if self.autoDetectMime == self.DETECT_MIME_COLLECTED_URL and self.processContentTypes is not None:
281  self.urlProcess.url = elemUrl
282  detectedMime = self.urlProcess.detectUrlMime(self.siteProperties["CONTENT_TYPE_MAP"] if \
283  "CONTENT_TYPE_MAP" in self.siteProperties else None)
284  if detectedMime not in self.processContentTypes:
285  logger.debug("Candidate URL MIME type is not matched, skipped!")
286  continue
287  # (7) END
288 
289  if "ROBOTS_COLLECT" not in self.siteProperties or int(self.siteProperties["ROBOTS_COLLECT"]) > 0:
290  logger.debug("Robots.txt obey mode is ON")
291  if self.robotsParser and self.robotsParser.loadRobots(elemUrl, self.batchItem.siteId, httpApplyHeaders,
292  proxyName):
293  isAllowed, retUserAgent = self.robotsParser.checkUrlByRobots(elemUrl, self.batchItem.siteId,
294  httpApplyHeaders)
295  if not isAllowed:
296  logger.debug("URL " + elemUrl + " is NOT Allowed by user-agent:" + str(retUserAgent))
297  self.urlProcess.updateURLForFailed(APP_CONSTS.ERROR_ROBOTS_NOT_ALLOW, self.batchItem)
298  continue
299 
300  self.urlProcess.siteId = localSiteId
301  depth = self.urlProcess.getDepthFromUrl(self.batchItem.urlId)
302 
303  # per project redirects resolving
304  if "HTTP_REDIRECT_RESOLVER" in self.siteProperties and self.siteProperties["HTTP_REDIRECT_RESOLVER"] != "":
305  logger.debug('!!!!!! HTTP_REDIRECT_RESOLVER !!!!! ')
306 
307 
308  if "CONNECTION_TIMEOUT" in self.siteProperties:
309  connectionTimeout = float(self.siteProperties["CONNECTION_TIMEOUT"])
310  else:
311  connectionTimeout = CRAWLER_CONSTS.CONNECTION_TIMEOUT
312 
313  tm = int(self.url.httpTimeout) / 1000.0
314  if isinstance(self.url.httpTimeout, float):
315  tm += float('0' + str(self.url.httpTimeout).strip()[str(self.url.httpTimeout).strip().find('.'):])
316 
317  proxies = {"http": "http://" + proxyName} if proxyName is not None else None
318 
319  auth = None
320  if 'HTTP_AUTH_NAME' in self.siteProperties and 'HTTP_AUTH_PWD' in self.siteProperties:
321  authName = self.siteProperties['HTTP_AUTH_NAME']
322  authPwd = self.siteProperties['HTTP_AUTH_PWD']
323  if authName is not None and authPwd is not None:
324  auth = (authName, authPwd)
325 
326  postForms = {}
327  for key in self.siteProperties.keys():
328  if key.startswith('HTTP_POST_FORM_'):
329  postForms[key[len('HTTP_POST_FORM_'):]] = self.siteProperties[key]
330  postData = self.urlProcess.resolveHTTP(postForms, httpApplyHeaders)
331 
332  maxRedirects = HTTPRedirectResolver.RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS
333  if 'HTTP_REDIRECTS_MAX' in self.siteProperties:
334  maxRedirects = int(self.siteProperties['HTTP_REDIRECTS_MAX'])
335 
336  redirectResolver = HTTPRedirectResolver(propertyString=self.siteProperties["HTTP_REDIRECT_RESOLVER"],
337  fetchType=self.site.fetchType,
338  dbWrapper=self.dbWrapper,
339  siteId=localSiteId,
340  connectionTimeout=connectionTimeout)
341 
342  resUrl = redirectResolver.resolveRedirectUrl(url=elemUrl,
343  headers=httpApplyHeaders,
344  timeout=tm,
345  allowRedirects=True,
346  proxies=proxies,
347  auth=auth,
348  postData=postData,
349  maxRedirects=maxRedirects,
350  filters=self.site.filters)
351 
352  logger.debug("Resolved url: %s", str(resUrl))
353  elemUrl = resUrl
354 
355  if elemUrl is not None:
356  self.urlProcess.url = elemUrl
357  self.urlProcess.siteId = localSiteId
358  self.urlProcess.urlObj = self.url
359  localUrlObj = self.urlProcess.createUrlObjForCollectURLs(urlMd5, formMethods, self.batchItem.urlId, depth,
360  detectedMime, self.site.maxURLsFromPage)
361  # update counters of external and internal links
362  localUrlObj.linksI = len(internalLinks)
363  localUrlObj.linksE = len(externalLinks)
364 
365  if self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS:
366  self.feedElementsProcessing(urlMd5, httpCode, elemUrl, localSiteId, localUrlObj, localUrl, params,
367  maxURLsFromPage)
368  else:
369  params.append(localUrlObj)
370 
371  if useChains and "URL_CHAIN" in self.siteProperties and self.siteProperties["URL_CHAIN"] is not None:
372  localChainDict = json.loads(self.siteProperties["URL_CHAIN"])
373  depth = self.urlProcess.getDepthFromUrl(self.batchItem.urlId)
374  if "url_pattern" in localChainDict:
375  for elemUrl in urlSet:
376  if elemUrl is None:
377  logger.debug("Some url from urlSet is None")
378  continue
379  self.urlProcess.url = elemUrl
380 # retUrl = self.urlProcess.simpleURLCanonize(self.realUrl)
381 # if retUrl is None or not UrlNormalizator.isNormalUrl(retUrl):
382 
383  retUrl = urlNormalization(self.baseUrl, elemUrl)
384  if retUrl is None:
385  logger.debug("Bad url normalization, url: %s", retUrl)
386  continue
387  else:
388  elemUrl = retUrl
389  detectedMime = ''
390  if self.autoDetectMime == self.DETECT_MIME_COLLECTED_URL and self.processContentTypes is not None:
391  self.urlProcess.url = elemUrl
392  detectedMime = self.urlProcess.detectUrlMime(self.siteProperties["CONTENT_TYPE_MAP"] if \
393  "CONTENT_TYPE_MAP" in self.siteProperties else None, \
394  self.batchItem.urlObj)
395  urlMd5 = hashlib.md5(elemUrl).hexdigest()
396  self.urlProcess.url = elemUrl
397  self.urlProcess.siteId = localSiteId
398  self.urlProcess.urlObj = self.url
399  try:
400  localUrlObj = self.urlProcess.\
401  createUrlObjForChain(localChainDict["url_pattern"], urlMd5, formMethods,
402  self.batchItem.urlId, depth, detectedMime, self.site.maxURLsFromPage)
403  if localUrlObj is not None:
404  chainUrls.append(copy.deepcopy(localUrlObj))
405  except Exception as excp:
406  logger.error("Error in URL_CHAIN deserialize, excp = " + str(excp))
407  if len(urlSet) > 0 and len(params) == 0:
408  logger.debug("Zero urls are collected for len(urlSet): %s", str(len(urlSet)))
409  elif len(params) > 0:
410  logger.debug("Collected and send to insert as new: %s", str(len(urlSet)))
411  if not readOnly:
412 
413  if self.dbWrapper is not None:
414  self.dbWrapper.urlNew(params)
415  self.dbWrapper.urlNew(chainUrls)
416  self.urlProcess.updateTypeForURLObjects(chainUrls)
417  self.dbWrapper.collectedURLsRecalculating(localSiteId)
418 
419  if formFields is not None and self.postForms is not None and self.dbWrapper is not None:
420  fieldParams = self.getFieldParams(formFields, self.postForms, localSiteId)
421  self.insertNewSiteProperties(fieldParams, self.dbWrapper, localSiteId)
422 
423  # logger.debug("Return from collectURLs:\n%s\n%s\n%s\n%s\n%s\n%s", str(nextStep), str(internalLinks),
424  # str(externalLinks), str(params), str(self.feedItems), str(chainUrls))
425 
426  return nextStep, internalLinks, externalLinks, params, self.feedItems, chainUrls
427 
428 
Here is the call graph for this function:
Here is the caller graph for this function:
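
process() is the main entry point of the class. Below is a sketch of the call and of the returned six-element tuple; httpCode and the header dict are placeholders, and collector is a wired-up instance as shown in the detailed description above.

# Sketch of calling process() and interpreting the returned tuple.
nextStep, internalLinks, externalLinks, params, feedItems, chainUrls = \
    collector.process(httpCode=200, readOnly=False,
                      httpApplyHeaders={},   # headers reused for robots.txt and redirect requests
                      proxyName=None)

# nextStep      - False for binary content, 4XX/5XX responses, empty content or single/chain-type URLs
# internalLinks - links classified as internal to the site (count stored in new URL objects' linksI)
# externalLinks - links classified as external (count stored in linksE)
# params        - new URL objects passed to dbWrapper.urlNew() unless readOnly is True
# feedItems     - per-entry results filled by feedElementsProcessing() for RSS/feed processors
# chainUrls     - URL objects built from the URL_CHAIN site property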

◆ processProcessor()

def dc_crawler.CollectURLs.CollectURLs.processProcessor (   self,
  urlSet,
  dom,
  urlXpathList,
  urlObj 
)

Definition at line 474 of file CollectURLs.py.

474  def processProcessor(self, urlSet, dom, urlXpathList, urlObj):
475  if (self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS) \
476  and urlObj.type != dc.EventObjects.URL.TYPE_FETCHED:
477  if feedparser is not None:
478  try:
479  self.feedItems = []
480  # Add one more date parsing handler function to fix some wrong datetime format cases; added by bgv
481  feedparser.registerDateHandler(self.feedparserParseDateFixes)
482 
483  # Remove handlers to process all tags as unknown to save their names unchanged
484  if not (self.siteProperties is not None and "RSS_FEEDPARSER_MODE" in self.siteProperties and \
485  int(self.siteProperties["RSS_FEEDPARSER_MODE"]) > 0):
486  import inspect
487  # , "_start_guid"
488  excludes = ["_start_rss", "_start_channel", "_start_feed", "_start_item", "_start_link",
489  "_start_admin_errorreportsto", "_start_admin_generatoragent", "_start_guid", "_start_id",
490  "_start_entry", "_start_enclosure"]
491  for methodName, functionObject in inspect.getmembers(feedparser._FeedParserMixin, predicate=inspect.ismethod): # pylint: disable=W0612,W0212,C0301
492  if methodName.startswith("_start_") and methodName not in excludes:
493  delattr(feedparser._FeedParserMixin, methodName) # pylint: disable=W0212
494  endMethodName = methodName.replace("_start_", "_end_")
495  if hasattr(feedparser._FeedParserMixin, endMethodName): # pylint: disable=W0212
496  delattr(feedparser._FeedParserMixin, endMethodName) # pylint: disable=W0212
497  setattr(feedparser._FeedParserMixin, "_normalize_attributes", self._normalize_attributes) # pylint: disable=W0212
498  feedparser.FeedParserDict.keymap["guid"] = "guid"
499  logger.debug("Feedparser in modified mode")
500  else:
501  logger.debug("Feedparser in native mode")
502 
503  self.feed = feedparser.parse(self.crawledResource.html_content)
504  urlSet.update(entry.link for entry in self.feed.entries)
505  # logger.debug("feed.entries: %s for url: %s\nfeed=\n%s\nurlSet:\n%s", str(len(self.feed.entries)),
506  # str(urlObj.url), str(dict(self.feed)), str(urlSet))
507  if len(self.feed.entries) == 0:
508  logger.debug("Zero entries in feed, self.crawledResource:\n%s", varDump(self.crawledResource))
509  # logger.debug("self.processContentTypes: %s", str(self.processContentTypes))
510  logger.debug("self.crawledResource.content_type = %s", str(self.crawledResource.content_type))
511  if self.crawledResource.content_type == dc.EventObjects.URL.CONTENT_TYPE_TEXT_HTML:
512  urlObj.errorMask |= APP_CONSTS.ERROR_MASK_SITE_UNSUPPORTED_CONTENT_TYPE
513 
514  except TypeError as err:
515  logger.debug("WRONG CONTENT FOR URL <" + str(urlObj.url) + "> not rss feed. " + str(err.message))
516  except Exception as err:
517  logger.debug("SOME ERROR WITH rss feed parse " + str(err.message))
518  else:
519  logger.debug("feedparser module not found")
520  # won't collect urls from rss feed resources
521  elif self.processorName != PCONSTS.PROCESSOR_RSS:
522  # Added support of urlXpathList as site's properties
523  if len(urlXpathList) > 0:
524  logger.debug("Site has COLLECT_URLS_XPATH_LIST property: %s", str(urlXpathList))
525  else:
526  # Set urls xpath list
527  urlXpathList = {'sets': {'': self.urlsXpathList}}
528  logger.debug("Site has no COLLECT_URLS_XPATH_LIST property, default xpath list used: %s", str(urlXpathList))
529  if 'sets' in urlXpathList and isinstance(urlXpathList['sets'], dict):
530  matchedSets = 0
531  if 'date_format' in urlXpathList:
532  dformat = str(urlXpathList['date_format'])
533  else:
534  dformat = '%Y-%m-%d %H:%M:%S'
535  for rexpr in urlXpathList['sets']:
536  if rexpr == '' or re.search(rexpr, urlObj.url) is not None:
537  if 'mode' in urlXpathList and int(urlXpathList['mode']) == 1:
538  xpathl = self.urlsXpathList + urlXpathList['sets'][rexpr]
539  else:
540  xpathl = urlXpathList['sets'][rexpr]
541  matchedSets += 1
542  for xpath in xpathl:
543  xpath = self.evaluateDateMacro(xpath, dformat)
544  elem = dom.xpath(xpath)
545  elem_type = type(elem)
546  if elem_type == list and len(elem) > 0 and hasattr(elem[0], "tail"):
547  urlSet.update([el.tail for el in elem])
548  elif elem_type == list and len(elem) > 0 and isinstance(elem[0], lxml.html.HtmlElement):
549  urlSet.update([el.text for el in elem])
550  else:
551  urlSet.update(elem)
552  if matchedSets == 0:
553  logger.debug("Warning! No one xpath set matched URL %s, URLs not collected!", urlObj.url)
554  else:
555  logger.debug('Wrong COLLECT_URLS_XPATH_LIST property, `sets` key with dict() of re->xpath_list[] expected!' + \
556  ' Collect URLs aborted!')
557  # logger.debug("urlSet: %s", str(urlSet))
558 
559 
Here is the call graph for this function:
Here is the caller graph for this function:
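
The COLLECT_URLS_XPATH_LIST site property consumed by this method has the shape below; this is a hedged reconstruction from the listing, and the XPath expressions are hypothetical examples.

# Hedged reconstruction of the COLLECT_URLS_XPATH_LIST property structure.
urlXpathList = {
    "sets": {
        "": ["//a/@href"],                            # empty regexp: applied to every URL
        "/news/": ["//div[@class='item']//a/@href"],  # applied when the regexp matches urlObj.url
    },
    "mode": 1,                             # 1: append to the default urlsXpathList, otherwise replace it
    "date_format": "%Y-%m-%d %H:%M:%S",    # used by evaluateDateMacro() for %@DATE(...)% macros
}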

Member Data Documentation

◆ autoDetectMime

dc_crawler.CollectURLs.CollectURLs.autoDetectMime

Definition at line 65 of file CollectURLs.py.

◆ autoRemoveProps

dc_crawler.CollectURLs.CollectURLs.autoRemoveProps

Definition at line 64 of file CollectURLs.py.

◆ baseUrl

dc_crawler.CollectURLs.CollectURLs.baseUrl

Definition at line 55 of file CollectURLs.py.

◆ batchItem

dc_crawler.CollectURLs.CollectURLs.batchItem

Definition at line 57 of file CollectURLs.py.

◆ BINARY_CONTENT_TYPE_PATTERN

dc_crawler.CollectURLs.CollectURLs.BINARY_CONTENT_TYPE_PATTERN = re.compile('(text)|(xml)', re.I)
static

Definition at line 40 of file CollectURLs.py.

◆ COLLECT_POST_DATA

string dc_crawler.CollectURLs.CollectURLs.COLLECT_POST_DATA = "1"
static

Definition at line 42 of file CollectURLs.py.

◆ COLLECT_POST_DATA_NAME

string dc_crawler.CollectURLs.CollectURLs.COLLECT_POST_DATA_NAME = "COLLECT_POST_DATA"
static

Definition at line 41 of file CollectURLs.py.

◆ crawledResource

dc_crawler.CollectURLs.CollectURLs.crawledResource

Definition at line 51 of file CollectURLs.py.

◆ dbWrapper

dc_crawler.CollectURLs.CollectURLs.dbWrapper

Definition at line 63 of file CollectURLs.py.

◆ DC_URLS_TABLE_PREFIX

string dc_crawler.CollectURLs.CollectURLs.DC_URLS_TABLE_PREFIX = "urls_"
static

Definition at line 43 of file CollectURLs.py.

◆ DETECT_MIME_COLLECTED_URL

string dc_crawler.CollectURLs.CollectURLs.DETECT_MIME_COLLECTED_URL = "2"
static

Definition at line 46 of file CollectURLs.py.

◆ DETECT_MIME_MAIN_CONTENT

string dc_crawler.CollectURLs.CollectURLs.DETECT_MIME_MAIN_CONTENT = "1"
static

Definition at line 45 of file CollectURLs.py.

◆ DETECT_MIME_TIMEOUT

int dc_crawler.CollectURLs.CollectURLs.DETECT_MIME_TIMEOUT = 1
static

Definition at line 47 of file CollectURLs.py.

◆ dom

dc_crawler.CollectURLs.CollectURLs.dom

Definition at line 53 of file CollectURLs.py.

◆ feed

dc_crawler.CollectURLs.CollectURLs.feed

Definition at line 60 of file CollectURLs.py.

◆ feedItems

dc_crawler.CollectURLs.CollectURLs.feedItems

Definition at line 59 of file CollectURLs.py.

◆ isAbortedByTTL

dc_crawler.CollectURLs.CollectURLs.isAbortedByTTL

Definition at line 71 of file CollectURLs.py.

◆ PATTERN_WITH_PROTOCOL

dc_crawler.CollectURLs.CollectURLs.PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
static

Definition at line 44 of file CollectURLs.py.

◆ postForms

dc_crawler.CollectURLs.CollectURLs.postForms

Definition at line 67 of file CollectURLs.py.

◆ processContentTypes

dc_crawler.CollectURLs.CollectURLs.processContentTypes

Definition at line 66 of file CollectURLs.py.

◆ processorName

dc_crawler.CollectURLs.CollectURLs.processorName

Definition at line 56 of file CollectURLs.py.

◆ realUrl

dc_crawler.CollectURLs.CollectURLs.realUrl

Definition at line 54 of file CollectURLs.py.

◆ robotsParser

dc_crawler.CollectURLs.CollectURLs.robotsParser

Definition at line 69 of file CollectURLs.py.

◆ site

dc_crawler.CollectURLs.CollectURLs.site

Definition at line 62 of file CollectURLs.py.

◆ siteProperties

dc_crawler.CollectURLs.CollectURLs.siteProperties

Definition at line 61 of file CollectURLs.py.

◆ url

dc_crawler.CollectURLs.CollectURLs.url

Definition at line 52 of file CollectURLs.py.

◆ urlProcess

dc_crawler.CollectURLs.CollectURLs.urlProcess

Definition at line 68 of file CollectURLs.py.

◆ urlsXpathList

dc_crawler.CollectURLs.CollectURLs.urlsXpathList

Definition at line 70 of file CollectURLs.py.

◆ urlXpathList

dc_crawler.CollectURLs.CollectURLs.urlXpathList

Definition at line 58 of file CollectURLs.py.


The documentation for this class was generated from the following file:

CollectURLs.py